001: /**
002: * Licensed under the GNU LESSER GENERAL PUBLIC LICENSE, version 2.1, dated February 1999.
003: *
004: * This program is free software; you can redistribute it and/or modify
005: * it under the terms of the latest version of the GNU Lesser General
006: * Public License as published by the Free Software Foundation;
007: *
008: * This program is distributed in the hope that it will be useful,
009: * but WITHOUT ANY WARRANTY; without even the implied warranty of
010: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
011: * GNU Lesser General Public License for more details.
012: *
013: * You should have received a copy of the GNU Lesser General Public License
014: * along with this program (LICENSE.txt); if not, write to the Free Software
015: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
016: */package org.jamwiki.servlets;
017:
018: import java.io.File;
019: import java.util.Hashtable;
020: import javax.xml.parsers.SAXParserFactory;
021: import javax.xml.parsers.SAXParser;
022: import org.apache.commons.lang.StringUtils;
023: import org.jamwiki.WikiBase;
024: import org.jamwiki.model.Topic;
025: import org.jamwiki.model.TopicVersion;
026: import org.jamwiki.model.WikiUser;
027: import org.jamwiki.parser.ParserOutput;
028: import org.jamwiki.parser.ParserUtil;
029: import org.jamwiki.utils.NamespaceHandler;
030: import org.jamwiki.utils.WikiLogger;
031: import org.xml.sax.Attributes;
032: import org.xml.sax.SAXException;
033: import org.xml.sax.helpers.DefaultHandler;
034:
035: /**
036: * The purpose of this class is to load MediaWiki XML-file to the JAMWiki.
037: */
038: public class XMLTopicFactory extends DefaultHandler {
039:
040: /** Amount to indent */
041: private static final String XML_INDENT = " ";
042:
043: private final WikiUser user;
044: private final String authorIpAddress;
045: private int indentLevel = 0;
046: String virtualWiki = "en";
047: Hashtable namespaces = new Hashtable();
048: String ns14 = "Category";
049: String ns6 = "Image";
050: Integer nsKey = null;
051: String nsVal = null;
052: StringBuffer lastStr = null;
053: String pageName = null;
054: String pageText = null;
055: private String processedTopicName = null;
056: private static String lineEnd = System
057: .getProperty("line.separator");
058: private static final WikiLogger logger = WikiLogger
059: .getLogger(XMLTopicFactory.class.getName());
060:
061: /**
062: *
063: */
064: public XMLTopicFactory(String virtualWiki, WikiUser user,
065: String authorIpAddress) {
066: this .virtualWiki = virtualWiki;
067: this .authorIpAddress = authorIpAddress;
068: this .user = user;
069: }
070:
071: /**
072: *
073: */
074: public String importWikiXml(File file) throws Exception {
075: //read ini params from file
076: // TODO read all params from JAMWiki properties
077: //importProps = Environment.loadProperties(PROPERTY_FILE_NAME);
078: //For big file parsing
079: System.setProperty("entityExpansionLimit", "1000000");
080: // Use an instance of ourselves as the SAX event handler
081: // DefaultHandler handler = new XMLPageFactory();
082: // Use the default (non-validating) parser
083: SAXParserFactory factory = SAXParserFactory.newInstance();
084: try {
085: // Parse the input file
086: SAXParser saxParser = factory.newSAXParser();
087: saxParser.parse(file, this );
088: } catch (Throwable t) {
089: logger.severe("Error by importing "
090: + ((XMLTopicFactory) this ).pageName, t);
091: throw new Exception("Error by import: " + t.getMessage(), t);
092: }
093: return this .processedTopicName;
094: }
095:
096: //===========================================================
097: // SAX DocumentHandler methods
098: //===========================================================
099:
100: /**
101: *
102: */
103: public void startDocument() throws SAXException {
104: nl();
105: nl();
106: emit("START DOCUMENT");
107: nl();
108: emit("<?xml version='1.0' encoding='UTF-8'?>");
109: }
110:
111: /**
112: *
113: */
114: public void endDocument() throws SAXException {
115: nl();
116: emit("END DOCUMENT");
117: nl();
118: }
119:
120: /**
121: * start of xml-tag
122: *
123: * @param lName Local name.
124: * @param qName Qualified name.
125: */
126: public void startElement(String namespaceURI, String lName,
127: String qName, Attributes attrs) throws SAXException {
128: indentLevel++;
129: nl();
130: emit("ELEMENT: ");
131: String eName = lName;
132: if ("".equals(eName)) {
133: eName = qName;
134: }
135: emit("<" + eName);
136: lastStr = new StringBuffer();
137: if (attrs != null) {
138: for (int i = 0; i < attrs.getLength(); i++) {
139: String aName = attrs.getLocalName(i); // Attr name
140: if ("".equals(aName)) {
141: aName = attrs.getQName(i);
142: }
143: nl();
144: emit(" ATTR: ");
145: emit(aName);
146: emit("\t\"");
147: emit(attrs.getValue(i));
148: emit("\"");
149: }
150: }
151: if (attrs.getLength() > 0) {
152: nl();
153: }
154: emit(">");
155: if ("namespace".equals(eName)) { // mapping of namespaces from imported file
156: nsKey = new Integer(attrs.getValue("key"));
157: }
158: if ("page".equals(eName)) {
159: pageName = "";
160: pageText = "";
161: }
162: }
163:
164: /**
165: * end of xml-tag
166: *
167: * @param sName Simple name.
168: * @param qName Qualified name.
169: */
170: public void endElement(String namespaceURI, String sName,
171: String qName) throws SAXException {
172: nl();
173: emit("END_ELM: ");
174: emit("</" + sName + ">");
175: if ("namespace".equals(qName)) { // mapping of namespaces from imported file
176: namespaces.put(lastStr.toString().trim(), nsKey);
177: //Prepare locale namespaces
178: //WikiArticle.addNamespace(nsKey.intValue(), lastStr.trim());
179: if (nsKey.intValue() == 14) {
180: ns14 = lastStr.toString().trim();
181: }
182: if (nsKey.intValue() == 6) {
183: ns6 = lastStr.toString().trim();
184: }
185: }
186: if ("title".equals(qName)) {
187: pageName = lastStr.toString().trim();
188: }
189: if ("text".equals(qName)) {
190: pageText = lastStr.toString().trim();
191: }
192: if ("page".equals(qName)) {
193: //Create Topic
194: String sNamespace = "";
195: int namespace = 0;
196: // get wiki namespace
197: int pos = pageName.indexOf(':');
198: if (pos > -1) {
199: sNamespace = pageName.substring(0, pos);
200: if (namespaces.containsKey(sNamespace)) {
201: namespace = ((Integer) namespaces.get(sNamespace))
202: .intValue();
203: } else { // unknown namespace
204: namespace = -1;
205: }
206: } else { // main namespace
207: namespace = 0;
208: }
209: // preprocess text of topic to fit JAMWiki
210: pageText = preprocessText(pageText);
211: Topic topic = new Topic();
212: topic
213: .setName(convertArticleNameFromWikipediaToJAMWiki(pageName));
214: topic.setVirtualWiki(virtualWiki);
215: topic.setTopicContent(pageText);
216: TopicVersion topicVersion = new TopicVersion(user,
217: authorIpAddress, "imported", pageText);
218: // manage mapping bitween MediaWiki and JAMWiki namespaces
219: topic
220: .setTopicType(convertNamespaceFromMediaWikiToJAMWiki(namespace));
221: // Store topic in database
222: try {
223: ParserOutput parserOutput = ParserUtil.parserOutput(
224: pageText, virtualWiki, pageName);
225: WikiBase.getDataHandler().writeTopic(topic,
226: topicVersion, parserOutput.getCategories(),
227: parserOutput.getLinks(), true, null);
228: this .processedTopicName = topic.getName();
229: } catch (Exception e) {
230: throw new SAXException(e);
231: }
232: }
233: indentLevel--;
234: }
235:
236: /**
237: *
238: */
239: public void characters(char buf[], int offset, int len)
240: throws SAXException {
241: lastStr.append(buf, offset, len);
242: }
243:
244: /**
245: * Wrap I/O exceptions in SAX exceptions, to suit handler signature requirements.
246: */
247: private void emit(String s) throws SAXException {
248: logger.fine(s);
249: }
250:
251: /**
252: * Start a new line and indent the next line appropriately.
253: */
254: private void nl() throws SAXException {
255: logger.fine(lineEnd);
256: for (int i = 0; i < indentLevel; i++) {
257: logger.fine(XML_INDENT);
258: }
259: }
260:
261: /**
262: * convert MediaWiki namespace-id to JAMWiki namespace-id
263: * @param mediaWikiNamespaceId
264: * @return
265: */
266: private int convertNamespaceFromMediaWikiToJAMWiki(
267: int mediaWikiNamespaceId) {
268: int ret = -1;
269: switch (mediaWikiNamespaceId) {
270: case 0:
271: ret = Topic.TYPE_ARTICLE;
272: break;
273: //case 0: ret = Topic.TYPE_REDIRECT; break; //special hendling for redirects
274: case 6:
275: ret = Topic.TYPE_IMAGE;
276: break;
277: case 14:
278: ret = Topic.TYPE_CATEGORY;
279: break;
280: //case 0: ret = Topic.TYPE_FILE; break;
281: //case 0: ret = Topic.TYPE_SYSTEM_FILE; break;
282: case 10:
283: ret = Topic.TYPE_TEMPLATE;
284: break;
285: }
286: return ret;
287: }
288:
289: /**
290: *
291: */
292: private String getJAMWikiNamespaceById(int jamWikiNamespaceId) {
293: String ret = "";
294: switch (jamWikiNamespaceId) {
295: case Topic.TYPE_IMAGE:
296: ret = NamespaceHandler.NAMESPACE_IMAGE;
297: break;
298: case Topic.TYPE_CATEGORY:
299: ret = NamespaceHandler.NAMESPACE_CATEGORY;
300: break;
301: case Topic.TYPE_TEMPLATE:
302: ret = NamespaceHandler.NAMESPACE_TEMPLATE;
303: break;
304: }
305: return ret;
306: }
307:
308: /**
309: *
310: */
311: private String convertArticleNameFromWikipediaToJAMWiki(
312: String fullName) {
313: String ret = fullName;
314: String sNamespace = "";
315: String sJAMNamespace = "";
316: String sTitle = pageName;
317: int pos = pageName.indexOf(':');
318: if (pos > -1) {
319: sNamespace = pageName.substring(0, pos);
320: if (namespaces.containsKey(sNamespace)) {
321: int namespace = ((Integer) namespaces.get(sNamespace))
322: .intValue();
323: sTitle = pageName.substring(pos + 1);
324: sJAMNamespace = getJAMWikiNamespaceById(convertNamespaceFromMediaWikiToJAMWiki(namespace));
325: if (sJAMNamespace.length() > 0) {
326: ret = sJAMNamespace + ":" + sTitle;
327: } else {//equivalent namespace in JAMWiki not found. Use original name
328: ret = sNamespace + ":" + sTitle;
329: }
330: } else { //namespace not found
331: ret = pageName;
332: }
333: } else { //main namespace
334: ret = pageName;
335: }
336: return ret;
337: }
338:
339: /**
340: * preprocess the text of topic
341: * convert all namespaces names from MediaWiki to JAMWiki local representation
342: * and so on...
343: */
344: public String preprocessText(String text) {
345: String ret = text;
346: // convert all namespaces names from MediaWiki to JAMWiki local representation
347: ret = StringUtils.replace(ret, "[[category:", "[["
348: + NamespaceHandler.NAMESPACE_CATEGORY + ":");
349: if (!"Category".equals(NamespaceHandler.NAMESPACE_CATEGORY)) {
350: ret = StringUtils.replace(ret, "[[Category:", "[["
351: + NamespaceHandler.NAMESPACE_CATEGORY + ":");
352: }
353: ret = StringUtils.replace(ret, "[[" + ns14 + ":", "[["
354: + NamespaceHandler.NAMESPACE_CATEGORY + ":");
355: ret = StringUtils.replace(ret, "[[image:", "[["
356: + NamespaceHandler.NAMESPACE_IMAGE + ":");
357: if (!"Image".equals(NamespaceHandler.NAMESPACE_CATEGORY)) {
358: ret = StringUtils.replace(ret, "[[Image:", "[["
359: + NamespaceHandler.NAMESPACE_IMAGE + ":");
360: }
361: ret = StringUtils.replace(ret, "[[" + ns6 + ":", "[["
362: + NamespaceHandler.NAMESPACE_IMAGE + ":");
363:
364: return ret;
365: }
366: }
|