001: // wikiParser.java
002: // ---------
003: // part of YaCy
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2007
007: // Created 22.02.2007
008: //
009: // This file is contributed by Franz Brausze
010: //
011: // $LastChangedDate: $
012: // $LastChangedRevision: $
013: // $LastChangedBy: $
014: //
015: // This program is free software; you can redistribute it and/or modify
016: // it under the terms of the GNU General Public License as published by
017: // the Free Software Foundation; either version 2 of the License, or
018: // (at your option) any later version.
019: //
020: // This program is distributed in the hope that it will be useful,
021: // but WITHOUT ANY WARRANTY; without even the implied warranty of
022: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
023: // GNU General Public License for more details.
024: //
025: // You should have received a copy of the GNU General Public License
026: // along with this program; if not, write to the Free Software
027: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
028: //
029: // Using this software in any meaning (reading, learning, copying, compiling,
030: // running) means that you agree that the Author(s) is (are) not responsible
031: // for cost, loss of data or any harm that may be caused directly or indirectly
032: // by usage of this softare or this documentation. The usage of this software
033: // is on your own risk. The installation and usage (starting/running) of this
034: // software may allow other people or application to access your computer and
035: // any attached devices and is highly dependent on the configuration of the
036: // software which must be done by the user of the software; the author(s) is
037: // (are) also not responsible for proper configuration and usage of the
038: // software, even if provoked by documentation provided together with
039: // the software.
040: //
041: // Any changes to this file according to the GPL as documented in the file
042: // gpl.txt aside this file in the shipment you received can be done to the
043: // lines that follows this copyright notice here, but changes must not be
044: // done inside the copyright notive above. A re-distribution must contain
045: // the intact and unchanged copyright notice.
046: // Contributions and changes to the program code must be marked as such.
047:
048: package de.anomic.data.wiki;
049:
050: import java.io.UnsupportedEncodingException;
051: import java.util.ArrayList;
052: import java.util.regex.Matcher;
053:
054: import de.anomic.data.wiki.tokens.DefinitionListToken;
055: import de.anomic.data.wiki.tokens.LinkToken;
056: import de.anomic.data.wiki.tokens.ListToken;
057: import de.anomic.data.wiki.tokens.SimpleToken;
058: import de.anomic.data.wiki.tokens.TableToken;
059: import de.anomic.data.wiki.tokens.Token;
060: import de.anomic.plasma.plasmaSwitchboard;
061: import de.anomic.yacy.yacyCore;
062:
063: public class knwikiParser implements wikiParser {
064:
// Tokens applied by parseUnescaped(); replaced with a freshly built set on every parse() call.
065: public Token[] tokens;
// Element names collected in parse() from Token.getBlockElementNames(), plus "hr";
// replaceBRs() adds no <br /> after a line that ends with one of these names followed by '>'.
066: private String[] BEs;
// Passed to the LinkToken constructed in parse(); not used elsewhere in this class.
067: private final plasmaSwitchboard sb;
068:
/**
 * Creates a parser bound to the given switchboard.
 *
 * @param sb switchboard that parse() hands to the LinkToken it creates; main() passes
 *           {@code null} here
 */
public knwikiParser(plasmaSwitchboard sb) {
    this.sb = sb;
}
072:
073: public static void main(String[] args) {
074: String text = "===T<pre>itle===\n"
075: + "==blubb== was ==ein '''shice'''==...och.bla\n"
076: + "* ein \n" + "*==test=</pre>=\n" + "** doppelt\n"
077: + "* ''tess*sst''\n" + "*** xyz\n" + "=]*** huch\n"
078: + "* ehehe***\n" + "* blubb\n" + "bliblablo\n\n\n"
079: + "* blubb\n" + "{|border=-1\n" + "|-\n"
080: + "||bla|| blubb\n" + "|-\n"
081: + "||align center|och||huch||\n" + "|}\n" + "\n"
082: + "# bla\n" + "# blubb\n" + "'''''ehehehe''''', ne?!\n"
083: + "[http://www/index.html,ne?!] -\n"
084: + "[[Image:blubb|BLA]] ---- och\n" + " blubb1\n"
085: + " blubb2\n" + ":doppel-blubb[= huch =]\n"
086: + ";hier:da\n" + ";dort:und so\n"
087: + ";;und:doppelt\n\n\n\n" + "[[Image:blubb|BLA]]";
088: // text = "[=\n=]* bla";
089: String t = "[=] ein fucking [= test =]-text[=,ne?!=] joa, [=alles=]wunderbar,"
090: + "[=denk ich=] mal =]";
091: long l = System.currentTimeMillis();
092: t = new knwikiParser(null).parse((args.length > 0) ? args[0]
093: : text, "localhost:8080");
094: System.out.println("parsing time: "
095: + (System.currentTimeMillis() - l) + " ms");
096: System.out.println("--- --- ---");
097: System.out.println(t);
098: }
099:
100: public String transform(String content) {
101: return parse(content, null);
102: }
103:
104: public String transform(String content, plasmaSwitchboard sb) {
105: return parse(content, null);
106: }
107:
108: public String transform(byte[] content)
109: throws UnsupportedEncodingException {
110: return parse(new String(content, "UTF-8"), null);
111: }
112:
113: public String transform(byte[] content, String encoding,
114: plasmaSwitchboard switchboard)
115: throws UnsupportedEncodingException {
116: return parse(new String(content, encoding), null);
117: }
118:
119: public String transform(byte[] content, String encoding)
120: throws UnsupportedEncodingException {
121: return parse(new String(content, encoding), null);
122: }
123:
124: public String transform(byte[] text, String encoding,
125: String publicAddress) throws UnsupportedEncodingException {
126: return parse(new String(text, encoding), publicAddress);
127: }
128:
129: public String transform(String text, String publicAddress) {
130: return parse(text, publicAddress);
131: }
132:
133: public String parse(String text, String publicAddress) {
134: tokens = new Token[] {
135: new SimpleToken('=', '=', new String[][] { null,
136: { "h2" }, { "h3" }, { "h4" } }, true),
137: new SimpleToken('\'', '\'', new String[][] { null,
138: { "i" }, { "b" }, null, { "b", "i" } }, false),
139: new LinkToken((publicAddress == null) ? yacyCore.seedDB
140: .mySeed().getPublicAddress() : publicAddress,
141: "Wiki.html?page=", sb),
142: new ListToken('*', "ul"), new ListToken('#', "ol"),
143: new ListToken(':', "blockquote", null),
144: new ListToken(' ', null, "tt", false),
145: new DefinitionListToken(), new TableToken() };
146: ArrayList<String> r = new ArrayList<String>();
147: for (int i = 0, k, j; i < tokens.length; i++)
148: if (tokens[i].getBlockElementNames() != null)
149: for (j = 0; j < tokens[i].getBlockElementNames().length; j++) {
150: if (tokens[i].getBlockElementNames()[j] == null)
151: continue;
152: if ((k = tokens[i].getBlockElementNames()[j]
153: .indexOf(' ')) > 1) {
154: r.add(tokens[i].getBlockElementNames()[j]
155: .substring(0, k));
156: } else {
157: r.add(tokens[i].getBlockElementNames()[j]);
158: }
159: }
160: r.add("hr");
161: BEs = (String[]) r.toArray(new String[r.size()]);
162:
163: Text[] tt = Text.split2Texts(text, "[=", "=]");
164: for (int i = 0; i < tt.length; i += 2)
165: tt[i].setText(parseUnescaped(tt[i].getText()));
166: text = Text.mergeTexts(tt);
167:
168: tt = Text.split2Texts(text, "<pre>", "</pre>");
169: for (int i = 0; i < tt.length; i += 2)
170: tt[i].setText(replaceBRs(tt[i].getText()));
171: return Text.mergeTexts(tt);
172: }
173:
174: public String parseUnescaped(String text) {
175: Token st;
176: Matcher m;
177: StringBuffer sb;
178: for (int i = 0; i < tokens.length; i++) {
179: st = tokens[i];
180: for (int j = 0; j < st.getRegex().length; j++) {
181: m = st.getRegex()[j].matcher(text);
182: sb = new StringBuffer();
183: while (m.find())
184: try {
185: //System.out.print("found " + st.getClass().getSimpleName() + ": " +
186: // m.group().replaceAll("\n", "\\\\n").replaceAll("\t", " ") + ", ");
187: if (st.setText(m.group(), j)) {
188: // System.out.println("usable");
189: } else {
190: // System.out.println("not usable");
191: continue;
192: }
193: m.appendReplacement(sb,
194: (st.getMarkup() == null) ? m.group()
195: : st.getMarkup());
196: } catch (wikiParserException e) {
197: m.appendReplacement(sb, st.getText());
198: }
199: text = new String(m.appendTail(sb));
200: }
201: }
202: return text.replaceAll("----", "<hr />");
203: }
204:
205: private String replaceBRs(String text) {
206: StringBuffer sb = new StringBuffer(text.length());
207: String[] tt = text.split("\n");
208: boolean replace;
209: for (int i = 0, j; i < tt.length; i++) {
210: replace = true;
211: for (j = 0; j < BEs.length; j++)
212: if (tt[i].endsWith(BEs[j] + ">")) {
213: replace = false;
214: break;
215: }
216: sb.append(tt[i]);
217: if (i < tt.length - 1) {
218: if (replace)
219: sb.append("<br />");
220: sb.append("\n");
221: }
222: }
223: return new String(sb);
224: }
225:
226: private static class Text {
227:
228: public static final String escapeNewLine = "@";
229:
230: private String text;
231: private final boolean escaped;
232: private final boolean nl;
233:
234: public Text(String text, boolean escaped, boolean newLineBefore) {
235: this .text = text;
236: this .escaped = escaped;
237: this .nl = newLineBefore;
238: }
239:
240: public String setTextPlain(String text) {
241: return this .text = text;
242: }
243:
244: public String setText(String text) {
245: if (this .nl)
246: this .text = text.substring(escapeNewLine.length());
247: else
248: this .text = text;
249: return this .text;
250: }
251:
252: public String getTextPlain() {
253: return this .text;
254: }
255:
256: public String getText() {
257: if (this .nl)
258: return escapeNewLine + this .text;
259: else
260: return this .text;
261: }
262:
263: public String toString() {
264: return this .text;
265: }
266:
267: public boolean isEscaped() {
268: return this .escaped;
269: }
270:
271: public boolean isNewLineBefore() {
272: return this .nl;
273: }
274:
275: private static Text[] split2Texts(String text,
276: String escapeBegin, String escapeEnd) {
277: if (text == null)
278: return null;
279: if (text.length() < 2)
280: return new Text[] { new Text(text, false, true) };
281:
282: int startLen = escapeBegin.length();
283: int endLen = escapeEnd.length();
284: ArrayList<Text> r = new ArrayList<Text>();
285: boolean escaped = text.startsWith(escapeBegin);
286: if (escaped)
287: r.add(new Text("", false, true));
288: int i, j = 0;
289: while ((i = text.indexOf((escaped) ? escapeEnd
290: : escapeBegin, j)) > -1) {
291: r.add(resolve2Text(text, escaped, (j > 0) ? j
292: + ((escaped) ? startLen : endLen) : 0, i,
293: escapeEnd));
294: j = i;
295: escaped = !escaped;
296: }
297: r.add(resolve2Text(text, escaped, (escaped) ? j
298: : (j > 0) ? j + endLen : 0, -1, escapeEnd));
299: return (Text[]) r.toArray(new Text[r.size()]);
300: }
301:
302: private static Text resolve2Text(String text, boolean escaped,
303: int from, int to, String escapeEnd) {
304: if (to == -1)
305: to = text.length();
306: return new Text(text.substring(from, to), escaped,
307: from < escapeEnd.length() + 2
308: || (!escaped && text.charAt(from
309: - escapeEnd.length() - 1) == '\n'));
310: }
311:
312: private static String mergeTexts(Text[] texts) {
313: StringBuffer sb = new StringBuffer();
314: for (int n = 0; n < texts.length; n++)
315: sb.append(texts[n].getTextPlain());
316: return new String(sb);
317: }
318: }
319: }
|