/*
 * Copyright 2004 Outerthought bvba and Schaubroeck nv
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.outerj.daisy.jspwiki_import;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.xerces.parsers.DOMParser;
import org.cyberneko.html.HTMLConfiguration;
import org.xml.sax.*;
import org.xml.sax.helpers.AttributesImpl;
import org.jaxen.dom.DOMXPath;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import org.w3c.dom.Node;
import org.outerj.daisy.htmlcleaner.HtmlCleanerFactory;
import org.outerj.daisy.htmlcleaner.HtmlCleanerTemplate;
import org.outerj.daisy.htmlcleaner.HtmlCleaner;
import org.outerj.daisy.repository.*;
import org.outerj.daisy.repository.clientimpl.RemoteRepositoryManager;

import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.Transformer;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.sax.SAXResult;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;
import java.util.*;
import java.io.*;
import java.net.URLDecoder;
/**
 * Standalone application to import the contents of a JSPWiki site into Daisy.
 * Currently written only for importing the Cocoon Wiki content, in order to
 * have some meaningful, realistically-sized test data.
 *
 * <p>The import runs in two passes: first all wiki pages are imported
 * into Daisy, then links are translated from wiki page names to Daisy
 * document ids.
 *
 * <p>To run, after the Maven build, execute target/runimport.sh.
 *
 * <p>To make this usable as a generic utility, at least the hardcoded
 * wiki location and the Daisy username, collection and URL should be
 * specifiable as command line parameters.
 */
public class JspWikiImporter {
    private String wikiPageURL = "http://wiki.cocoondev.org/Wiki.jsp?page=";
    private String collectionName = "cocoon";
    private String daisyUser = "jspwiki-import";
    private String daisyPassword = "topsecret";
    private HashSet allPageNames = new HashSet();
    private DocumentBuilder documentBuilder;
    private HtmlCleanerTemplate htmlCleanerTemplate;
    private SAXTransformerFactory transformerFactory =
            (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    private Repository repository;
    private HashMap importPages = new HashMap();
    private HashMap importedImages = new HashMap();
    private HashMap importedAttachments = new HashMap();
    private DocumentCollection collection;
    private static HashSet skipPages = new HashSet();
    static {
        skipPages.add("UndefinedPages");
        skipPages.add("UnusedPages");
        skipPages.add("IndexPage");
        skipPages.add("RecentChanges");
        skipPages.add("FullRecentChanges");
    }

    public static void main(String[] args) throws Exception {
        new JspWikiImporter().run();
    }

    public void run() throws Exception {
        // initialize some stuff
        System.out.println("Doing preparations...");
        documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        File htmlCleanerConfig = new File(
                "../daisywiki/frontend/src/cocoon/webapp/daisy/resources/conf/htmlcleaner.xml");
        htmlCleanerTemplate = new HtmlCleanerFactory()
                .buildTemplate(new InputSource(new FileInputStream(htmlCleanerConfig)));

        // connect to daisy
        System.out.println("Connecting to daisy...");
        Credentials credentials = new Credentials(daisyUser, daisyPassword);
        RepositoryManager repositoryManager = new RemoteRepositoryManager(
                "http://localhost:9263", credentials);
        repository = repositoryManager.getRepository(credentials);
        collection = repository.getCollectionManager()
                .getCollectionByName(collectionName, false);

        // load wiki page names
        System.out.println("Fetching list of all pages on the wiki...");
        loadPageNames();
        System.out.println(allPageNames.size() + " pages found on the wiki.");
        System.out.println();
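        // Pass 1: fetch each wiki page, extract and clean its content, and
        // store it as a new Daisy document. The resulting document ids are
        // collected in importPages for the link translation pass below.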
        String[] pages = (String[]) allPageNames.toArray(new String[allPageNames.size()]);
        for (int i = 0; i < pages.length; i++) {
            if (pages[i].startsWith("Wyona") || skipPages.contains(pages[i])) {
                System.out.println("Skipping page " + pages[i]);
            } else {
                System.out.println("Fetching page " + pages[i]
                        + "... (" + (i + 1) + " of " + pages.length + ")");
                byte[] pageData = fetchPage(pages[i]);

                System.out.println("Parsing and cleaning HTML...");
                org.w3c.dom.Document pageDocument = parseHtml(pageData);
                DOMXPath xpath = new DOMXPath("//div[@class='content']");
                Element contentDiv = (Element) xpath.selectSingleNode(pageDocument);
                if (contentDiv == null)
                    throw new Exception("No content found in page " + pages[i]);
                String contentData = serialize(contentDivToDoc(contentDiv));
                byte[] cleanedContent = clean(contentData);

                System.out.println("Storing page in Daisy...");
                Document document = repository.createDocument(pages[i], "SimpleDocument");
                document.setPart("SimpleDocumentContent", "text/xml", cleanedContent);
                document.addToCollection(collection);
                document.save();
                importPages.put(pages[i], new Long(document.getId()));
                System.out.println("Done\n");
            }
        }

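        // Pass 2: now that all page ids are known, rewrite links between
        // wiki pages into daisy: links.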
        System.out.println("\n\nWILL NOW START LINK TRANSLATION\n\n");

        Iterator importPagesIt = importPages.entrySet().iterator();
        while (importPagesIt.hasNext()) {
            Map.Entry entry = (Map.Entry) importPagesIt.next();
            String pageName = (String) entry.getKey();
            long pageId = ((Long) entry.getValue()).longValue();

            System.out.println("Translating links for document " + pageName + "...");
            Document document = repository.getDocument(pageId, true);
            byte[] pageData = document.getPart("SimpleDocumentContent").getData();
            byte[] newData = clean(translateLinks(pageData));
            document.setPart("SimpleDocumentContent", "text/xml", newData);
            document.save();
            System.out.println("Done\n");
        }
    }

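    /**
     * Runs the given HTML through the Daisy HtmlCleaner, producing the
     * well-formed XHTML subset that Daisy expects in document parts.
     */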
    private byte[] clean(String htmlData) throws Exception {
        HtmlCleaner cleaner = htmlCleanerTemplate.newHtmlCleaner();
        return cleaner.cleanToByteArray(htmlData);
    }

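    /**
     * Wraps the children of the wiki page's content div in a new html/body
     * document, dropping the page name heading and stopping at the div with
     * class "bottom", which marks the end of the actual page content.
     */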
    private org.w3c.dom.Document contentDivToDoc(Element contentDiv) {
        org.w3c.dom.Document doc = documentBuilder.newDocument();
        Element htmlEl = doc.createElementNS(null, "html");
        doc.appendChild(htmlEl);
        Element bodyEl = doc.createElementNS(null, "body");
        htmlEl.appendChild(bodyEl);
        NodeList childNodes = contentDiv.getChildNodes();
        for (int i = 0; i < childNodes.getLength(); i++) {
            Node node = childNodes.item(i);
            boolean append = true;
            if (node instanceof Element && node.getLocalName().equals("h1")) {
                Element h1El = (Element) node;
                // drop the page name heading
                if (h1El.getAttribute("class").equals("pagename")) {
                    append = false;
                }
            } else if (node instanceof Element && node.getLocalName().equals("div")) {
                Element divEl = (Element) node;
                // detect end of content by presence of a div with class bottom
                if (divEl.getAttribute("class").equals("bottom")) {
                    return doc;
                }
            }
            if (append)
                bodyEl.appendChild(doc.importNode(node, true));
        }
        return doc;
    }

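    /**
     * Serializes a DOM document to a string, running it through the
     * ExtraCleanup handler on the way out.
     */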
    private String serialize(org.w3c.dom.Document doc) throws Exception {
        TransformerHandler serializer = transformerFactory.newTransformerHandler();
        StringWriter writer = new StringWriter();
        serializer.setResult(new StreamResult(writer));

        Transformer streamer = transformerFactory.newTransformer();
        streamer.transform(new DOMSource(doc), new SAXResult(new ExtraCleanup(serializer)));
        return writer.toString();
    }

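    /**
     * Fills allPageNames by fetching the wiki's IndexPage and collecting
     * the page names from all links with class "wikipage".
     */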
    private void loadPageNames() throws Exception {
        byte[] indexPageData = fetchPage("IndexPage");
        org.w3c.dom.Document document = parseHtml(indexPageData);
        DOMXPath xpath = new DOMXPath("//a[@class='wikipage']");
        List nodes = xpath.selectNodes(document);
        Iterator nodesIt = nodes.iterator();
        while (nodesIt.hasNext()) {
            Element element = (Element) nodesIt.next();
            String href = element.getAttribute("href");
            if (href.startsWith(wikiPageURL))
                allPageNames.add(href.substring(wikiPageURL.length()));
        }
    }

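    /**
     * Fetches a wiki page over HTTP and returns the raw response body.
     */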
    private byte[] fetchPage(String pageName) throws Exception {
        HttpClient client = new HttpClient();
        HttpMethod method = new GetMethod(wikiPageURL + pageName);
        int status = client.executeMethod(method);
        if (status != HttpStatus.SC_OK)
            throw new Exception("Problem retrieving wiki page " + pageName
                    + " : " + status + " : " + HttpStatus.getStatusText(status));
        return method.getResponseBody();
    }

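    /**
     * Parses (possibly malformed) HTML into a DOM tree using NekoHTML,
     * with namespace processing enabled and element and attribute names
     * normalized to lower case.
     */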
    private org.w3c.dom.Document parseHtml(byte[] data) throws Exception {
        DOMParser parser = new DOMParser(new HTMLConfiguration());
        parser.setFeature("http://xml.org/sax/features/namespaces", true);
        parser.setFeature("http://cyberneko.org/html/features/override-namespaces", false);
        parser.setFeature("http://cyberneko.org/html/features/insert-namespaces", false);
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");

        parser.parse(new InputSource(new ByteArrayInputStream(data)));
        return parser.getDocument();
    }

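    /**
     * Re-parses stored page content and pipes it through the LinkTranslator
     * to replace wiki page links with daisy: links.
     */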
    private String translateLinks(byte[] data) throws Exception {
        TransformerHandler serializer = transformerFactory.newTransformerHandler();
        StringWriter writer = new StringWriter();
        serializer.setResult(new StreamResult(writer));

        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
        parserFactory.setNamespaceAware(true);
        SAXParser parser = parserFactory.newSAXParser();
        parser.getXMLReader().setContentHandler(new LinkTranslator(serializer));
        parser.getXMLReader().parse(new InputSource(new ByteArrayInputStream(data)));

        return writer.toString();
    }

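    /**
     * Base class for SAX filters: forwards all ContentHandler events
     * unchanged to the consumer, so subclasses only need to override the
     * events they are interested in.
     */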
    class AbstractTransformer implements ContentHandler {
        protected ContentHandler consumer;

        public AbstractTransformer(ContentHandler consumer) {
            this.consumer = consumer;
        }

        public void endDocument() throws SAXException {
            consumer.endDocument();
        }

        public void startDocument() throws SAXException {
            consumer.startDocument();
        }

        public void characters(char[] ch, int start, int length) throws SAXException {
            consumer.characters(ch, start, length);
        }

        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            consumer.ignorableWhitespace(ch, start, length);
        }

        public void endPrefixMapping(String prefix) throws SAXException {
            consumer.endPrefixMapping(prefix);
        }

        public void skippedEntity(String name) throws SAXException {
            consumer.skippedEntity(name);
        }

        public void setDocumentLocator(Locator locator) {
            consumer.setDocumentLocator(locator);
        }

        public void processingInstruction(String target, String data) throws SAXException {
            consumer.processingInstruction(target, data);
        }

        public void startPrefixMapping(String prefix, String uri) throws SAXException {
            consumer.startPrefixMapping(prefix, uri);
        }

        public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
            consumer.endElement(namespaceURI, localName, qName);
        }

        public void startElement(String namespaceURI, String localName, String qName,
                Attributes atts) throws SAXException {
            consumer.startElement(namespaceURI, localName, qName, atts);
        }
    }

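    /**
     * SAX filter that rewrites href attributes on anchor elements: links
     * pointing to imported wiki pages are replaced by "daisy:" links using
     * the document ids collected during the first import pass.
     */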
    class LinkTranslator extends AbstractTransformer {

        public LinkTranslator(ContentHandler consumer) {
            super(consumer);
        }

        public void startElement(String uri, String localName, String qName,
                Attributes attributes) throws SAXException {
            if (uri.equals("") && localName.equals("a")) {
                int index = attributes.getIndex("href");
                String href = (index != -1 ? attributes.getValue(index) : null);
                if (href != null && href.startsWith(wikiPageURL)) {
                    String linkedPage = href.substring(wikiPageURL.length());
                    Long linkedPageId = (Long) importPages.get(linkedPage);
                    System.out.println("attempt translation of " + linkedPage
                            + " to " + linkedPageId);
                    if (linkedPageId != null) {
                        AttributesImpl newAttrs = new AttributesImpl(attributes);
                        newAttrs.setAttribute(newAttrs.getIndex("href"), "", "href",
                                "href", "CDATA", "daisy:" + linkedPageId.longValue());
                        attributes = newAttrs;
                    }
                }
            }
            consumer.startElement(uri, localName, qName, attributes);
        }
    }

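    /**
     * SAX filter applied while serializing page content: it drops JSPWiki
     * decoration images, and downloads referenced images and attachments
     * so they can be stored as separate Daisy documents, rewriting the
     * img and a elements to point to those documents.
     */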
    class ExtraCleanup extends AbstractTransformer {
        private boolean dropNextImgEndTag = false;

        public ExtraCleanup(ContentHandler consumer) {
            super(consumer);
        }

        public void startElement(String namespaceURI, String localName,
                String qName, Attributes atts) throws SAXException {
            if (namespaceURI.equals("") && localName.equals("img")
                    && ("http://wiki.cocoondev.org/images/out.png".equals(atts.getValue("src"))
                            || "images/attachment_small.png".equals(atts.getValue("src")))) {
                // skip JSPWiki decoration images
                dropNextImgEndTag = true;
            } else if (namespaceURI.equals("") && localName.equals("img")) {
                String src = atts.getValue("src");
                if (src == null) {
                    super.startElement(namespaceURI, localName, qName, atts);
                } else if (importedImages.containsKey(src)) {
                    // image was imported earlier: only rewrite the reference
                    AttributesImpl newAttrs = new AttributesImpl();
                    newAttrs.addAttribute("", "src", "src", "CDATA",
                            "daisy:" + importedImages.get(src));
                    super.startElement("", "img", "img", newAttrs);
                } else {
                    try {
                        HttpClient client = new HttpClient();
                        HttpMethod method = new GetMethod(src);
                        int status = client.executeMethod(method);
                        if (status >= 300 && status < 400) {
                            // follow a single redirect
                            method = new GetMethod(
                                    method.getResponseHeader("location").getValue());
                            status = client.executeMethod(method);
                        }
                        if (status != HttpStatus.SC_OK)
                            throw new Exception("Problem retrieving image " + src
                                    + " : " + status + " : "
                                    + HttpStatus.getStatusText(status));
                        byte[] data = method.getResponseBody();
                        String name = getImageName(src);
                        Document imageDocument = repository.createDocument(name, "Image");
                        imageDocument.setPart("ImageData",
                                method.getResponseHeader("Content-Type").getValue(), data);
                        imageDocument.addToCollection(collection);
                        imageDocument.save();
                        importedImages.put(src, String.valueOf(imageDocument.getId()));
                        AttributesImpl newAttrs = new AttributesImpl();
                        newAttrs.addAttribute("", "src", "src", "CDATA",
                                "daisy:" + imageDocument.getId());
                        super.startElement("", "img", "img", newAttrs);
                        System.out.println("Imported image " + src + " as " + name);
                    } catch (Exception e) {
                        throw new SAXException("Error getting image " + src, e);
                    }
                }
            } else if (namespaceURI.equals("") && localName.equals("a")
                    && "attachment".equals(atts.getValue("class"))) {
                String src = atts.getValue("href");
                String decodedSrc = null;
                try {
                    decodedSrc = URLDecoder.decode(src, "UTF-8");
                } catch (UnsupportedEncodingException e) {
                    throw new SAXException(e);
                }
                if (importedAttachments.containsKey(src)) {
                    // attachment was imported earlier: only rewrite the reference
                    AttributesImpl newAttrs = new AttributesImpl();
                    newAttrs.addAttribute("", "href", "href", "CDATA",
                            "daisy:" + importedAttachments.get(src));
                    super.startElement("", "a", "a", newAttrs);
                } else {
                    try {
                        HttpClient client = new HttpClient();
                        HttpMethod method = new GetMethod(src);
                        int status = client.executeMethod(method);
                        if (status != HttpStatus.SC_OK)
                            throw new Exception("Problem retrieving attachment " + src
                                    + " : " + status + " : "
                                    + HttpStatus.getStatusText(status));
                        byte[] data = method.getResponseBody();
                        String name = getImageName(decodedSrc);
                        Document attachmentDocument = repository.createDocument(name, "Attachment");
                        attachmentDocument.setPart("AttachmentData",
                                method.getResponseHeader("Content-Type").getValue(), data);
                        attachmentDocument.addToCollection(collection);
                        attachmentDocument.save();
                        importedAttachments.put(src, String.valueOf(attachmentDocument.getId()));
                        AttributesImpl newAttrs = new AttributesImpl();
                        newAttrs.addAttribute("", "href", "href", "CDATA",
                                "daisy:" + attachmentDocument.getId());
                        super.startElement("", "a", "a", newAttrs);
                        System.out.println("Imported attachment " + src + " as " + name);
                    } catch (Exception e) {
                        throw new SAXException("Error getting attachment " + src, e);
                    }
                }
            } else {
                super.startElement(namespaceURI, localName, qName, atts);
            }
        }

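        /**
         * Derives a document name from a URL: takes the part after the last
         * slash and strips the file extension.
         */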
        private String getImageName(String src) {
            String name = src.substring(src.lastIndexOf('/') + 1);
            int dotpos = name.lastIndexOf('.');
            if (dotpos != -1) {
                name = name.substring(0, dotpos);
            }
            return name;
        }

        public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
            if (dropNextImgEndTag && namespaceURI.equals("") && localName.equals("img")) {
                // skip: this end tag belongs to a dropped decoration image
                // (note that this code assumes img elements are never nested)
                dropNextImgEndTag = false;
            } else {
                super.endElement(namespaceURI, localName, qName);
            }
        }
    }
}
|