0001: package org.mmbase.util.transformers;
0002:
0003: import java.io.*;
0004: import java.util.HashMap;
0005: import java.util.Map;
0006:
0007: import javax.xml.transform.stream.StreamSource;
0008: import javax.xml.transform.stream.StreamResult;
0009:
0010: import java.util.regex.*;
0011:
0012: import org.mmbase.util.StringObject;
0013: import org.mmbase.util.ResourceLoader;
0014: import org.mmbase.util.XSLTransformer;
0015:
0016: import org.mmbase.util.logging.Logger;
0017: import org.mmbase.util.logging.Logging;
0018:
0019: /**
0020: * XMLFields in MMBase. This class can encode such a field to several other formats.
0021: *
0022: * @author Michiel Meeuwissen
0023: * @version $Id: XmlField.java,v 1.51 2007/08/04 08:09:14 michiel Exp $
0024: */
0025:
0026: public class XmlField extends ConfigurableStringTransformer implements
0027: CharTransformer {
0028:
0029: private static final Logger log = Logging
0030: .getLoggerInstance(XmlField.class);
0031:
0032: // can be decoded:
0033: public final static int POORBODY = 5;
0034: public final static int RICHBODY = 6;
0035:
0036: // cannot yet be encoded even..
0037: public final static int HTML_INLINE = 7;
0038: public final static int HTML_BLOCK = 8;
0039: public final static int HTML_BLOCK_BR = 9;
0040: public final static int HTML_BLOCK_NOSURROUNDINGP = 10;
0041: public final static int HTML_BLOCK_BR_NOSURROUNDINGP = 11;
0042: public final static int HTML_BLOCK_LIST = 12;
0043: public final static int HTML_BLOCK_LIST_BR = 13;
0044: public final static int HTML_BLOCK_LIST_NOSURROUNDINGP = 14;
0045: public final static int HTML_BLOCK_LIST_BR_NOSURROUNDINGP = 15;
0046:
0047: // cannot be decoded:
0048: public final static int ASCII = 51;
0049: public final static int XHTML = 52;
0050:
0051: private final static String CODING = "UTF-8"; // This class only support UTF-8 now.
0052:
0053: private static boolean isListChar(char c) {
0054: return c == '-' || c == '*';
0055: }
0056:
0057: private static String listTag(char c) {
0058: return c == '-' ? "ul" : "ol";
0059: }
0060:
0061: /**
0062: * Takes a string object, finds list structures and changes those to XML
0063: */
0064: static void handleList(StringObject obj) {
0065: // handle lists
0066: // make <ul> possible (not yet nested), with -'s on the first char of line.
0067: int inList = 0; //
0068: int pos = 0;
0069: if (obj.length() < 3) {
0070: return;
0071: }
0072: char listChar = '-';
0073: if (isListChar(obj.charAt(0)) && !isListChar(obj.charAt(1))) { // hoo, we even _start_ with a list;
0074: listChar = obj.charAt(0);
0075: obj.insert(0, "\n"); // in the loop \n- is deleted, so it must be there.
0076: } else {
0077: while (true) {
0078: int pos1 = obj.indexOf("\n-", pos); // search the first
0079: int pos2 = obj.indexOf("\n*", pos); // search the first
0080:
0081: pos = (pos1 > 0 && pos1 < pos2) || pos2 < 0 ? pos1
0082: : pos2;
0083: if (pos == -1 || obj.length() <= pos + 2)
0084: break;
0085: if (!isListChar(obj.charAt(pos + 2))) {
0086: listChar = obj.charAt(pos + 1);
0087: break;
0088: }
0089: pos += 2;
0090: }
0091: }
0092:
0093: listwhile: while (pos != -1) {
0094: if (inList == 0) { // not yet in list
0095: inList++; // now we are
0096: obj.delete(pos, 2); // delete \n-
0097: // remove spaces..
0098: while (pos < obj.length() && obj.charAt(pos) == ' ') {
0099: obj.delete(pos, 1);
0100: }
0101: if (pos > 0) {
0102: obj.insert(pos, "\n");
0103: pos += 1;
0104: }
0105: obj.insert(pos, "<" + listTag(listChar) + ">\r<li>"); // insert 9 chars.
0106: pos += 9;
0107:
0108: } else { // already in list
0109: if (obj.charAt(pos + 1) != listChar) { // end of list
0110: obj.delete(pos, 1); // delete \n
0111: obj.insert(pos, "</li>\r</" + listTag(listChar)
0112: + ">\n");
0113: pos += 12;
0114: inList--;
0115: } else { // not yet end
0116: obj.delete(pos, 2); // delete \n-
0117: // remove spaces..
0118: while (pos < obj.length() && obj.charAt(pos) == ' ')
0119: obj.delete(pos, 1);
0120: obj.insert(pos, "</li>\r<li>");
0121: pos += 10;
0122: }
0123: }
0124: if (inList > 0) { // search for new line
0125: pos = obj.indexOf("\n", pos);
0126: if (pos == -1)
0127: break; // no new line found? End of list, of text.
0128: if (pos + 1 == obj.length()) {
0129: obj.delete(pos, 1);
0130: break; // if end of text, simply remove the newline.
0131: }
0132: while (obj.charAt(pos + 1) == ' ') {
0133: // if next line starts with space, this new line does not count. This makes it possible to have some formatting in a <li>
0134: pos = obj.indexOf("\n", pos + 1);
0135: if (pos + 1 == obj.length()) {
0136: obj.delete(pos, 1);
0137: break listwhile; // nothing to do...
0138: }
0139: }
0140: } else { // search for next item
0141: while (true) {
0142: int pos1 = obj.indexOf("\n-", pos);
0143: int pos2 = obj.indexOf("\n*", pos);
0144:
0145: pos = (pos1 > 0 && pos1 < pos2) || pos2 < 0 ? pos1
0146: : pos2;
0147: if (pos == -1 || obj.length() <= pos + 2)
0148: break;
0149: if (!isListChar(obj.charAt(pos + 2))) {
0150: listChar = obj.charAt(pos + 1);
0151: break; // should not start with two -'s, because this is some seperation line
0152: }
0153: pos += 2;
0154: }
0155: }
0156: }
0157: // make sure that the list is closed:
0158: while (inList > 0) { // lists in lists not already supported, but if we will...
0159: obj.insert(obj.length(), "</li></" + listTag(listChar)
0160: + ">\n");
0161: inList--; // always finish with a new line, it might be needed for the finding of paragraphs.
0162: }
0163:
0164: }
0165:
0166: /**
0167: * If you want to add a _ in your text, that should be possible too...
0168: * Should be done last, because no tags can appear in <em>
0169:
0170: * @param ch This is '_' or e.g. '*'
0171: * @param tag The tag to produce, e.g. "em" or "strong"
0172: */
0173: // test cases:
0174: // I cite _m_pos_! -> <mmxf><p>I cite <em>m_pos</em>!</p></mmxf>
0175: static void handleEmph(StringObject obj, char ch, String tag) {
0176:
0177: obj.replace("" + ch + ch, "_"); // makes it possible to escape underscores (or what you choose)
0178:
0179: // Emphasizing. This is perhaps also asking for trouble, because
0180: // people will try to use it like <font> or other evil
0181: // things. But basicly emphasizion is content, isn't it?
0182:
0183: String sch = "" + ch;
0184:
0185: int posEmphOpen = obj.indexOf(sch, 0);
0186: int posTagOpen = obj.indexOf("<", 0); // must be closed before next tag opens.
0187:
0188: OUTER: while (posEmphOpen != -1) {
0189:
0190: if (posTagOpen > 0 && posTagOpen < posEmphOpen) { // ensure that we are not inside existing tags
0191: int posTagClose = obj.indexOf(">", posTagOpen);
0192: if (posTagClose == -1)
0193: break;
0194: posEmphOpen = obj.indexOf(sch, posTagClose);
0195: posTagOpen = obj.indexOf("<", posTagClose);
0196: continue;
0197: }
0198:
0199: if (posEmphOpen + 1 >= obj.length())
0200: break; // no use, nothing can follow
0201:
0202: if ((posEmphOpen > 0 && Character.isLetterOrDigit(obj
0203: .charAt(posEmphOpen - 1)))
0204: || (!Character.isLetterOrDigit(obj
0205: .charAt(posEmphOpen + 1)))) {
0206: // _ is inside a word, ignore that.
0207: // or not starting a word
0208: posEmphOpen = obj.indexOf(sch, posEmphOpen + 1);
0209: continue;
0210: }
0211:
0212: // now find closing _.
0213: int posEmphClose = obj.indexOf(sch, posEmphOpen + 1);
0214: if (posEmphClose == -1)
0215: break;
0216: while ((posEmphClose + 1) < obj.length()
0217: && (Character.isLetterOrDigit(obj
0218: .charAt(posEmphClose + 1)))) {
0219: posEmphClose = obj.indexOf(sch, posEmphClose + 1);
0220: if (posEmphClose == -1)
0221: break OUTER;
0222: }
0223:
0224: if (posTagOpen > 0 && posEmphClose > posTagOpen) {
0225: posEmphOpen = obj.indexOf(sch, posTagOpen); // a tag opened before emphasis close, ignore then too, and re-search
0226: continue;
0227: }
0228:
0229: // realy do replacing now
0230: obj.delete(posEmphClose, 1);
0231: obj.insert(posEmphClose, "</" + tag + ">");
0232: obj.delete(posEmphOpen, 1);
0233: obj.insert(posEmphOpen, "<" + tag + ">");
0234: posEmphClose += 7;
0235:
0236: posEmphOpen = obj.indexOf(sch, posEmphClose);
0237: posTagOpen = obj.indexOf("<", posEmphClose);
0238:
0239: }
0240:
0241: obj.replace("_", sch);
0242: }
0243:
0244: /**
0245: * Some paragraphs are are really \sections. So this handler can
0246: * be done after handleParagraphs. It will search the paragraphs
0247: * which are really headers, and changes them. A header, in our
0248: * 'rich' text format, is a paragraph starting with one or more $.
0249: * If there are more then one, the resulting <section> tags are
0250: * going to be nested.
0251: *
0252: */
0253: static void handleHeaders(StringObject obj) {
0254: // handle headers
0255: int requested_level;
0256: char ch;
0257: int level = 0; // start without being in section.
0258: int pos = obj.indexOf("<p>$", 0);
0259: OUTER: while (pos != -1) {
0260: obj.delete(pos, 4); // remove <p>$
0261:
0262: requested_level = 1;
0263: // find requested level:
0264: while (true) {
0265: ch = obj.charAt(pos);
0266: if (ch == '$') {
0267: requested_level++;
0268: obj.delete(pos, 1);
0269: } else {
0270: if (ch == ' ') {
0271: obj.delete(pos, 1);
0272: }
0273: break;
0274: }
0275: }
0276: StringBuilder add = new StringBuilder();
0277: for (; requested_level <= level; level--) {
0278: // same or higher level section
0279: add.append("</section>");
0280: }
0281: level++;
0282: for (; requested_level > level; level++) {
0283: add.append("<section>");
0284: }
0285: add.append("<section><h>");
0286:
0287: obj.insert(pos, add.toString());
0288: pos += add.length();
0289:
0290: // search end title of header;
0291:
0292: while (true) { // oh yes, and don't allow _ in title.
0293: int pos1 = obj.indexOf("_", pos);
0294: int posP = obj.indexOf("</p>", pos);
0295: int posNl = obj.indexOf("\n", pos);
0296: int delete;
0297: int pos2;
0298: if ((posP > 0 && posP < posNl) || posNl == -1) {
0299: pos2 = posP;
0300: delete = 4;
0301: } else {
0302: pos2 = posNl;
0303: delete = 1;
0304: }
0305: if (pos1 < pos2 && pos1 > 0) {
0306: obj.delete(pos1, 1);
0307: } else {
0308: pos = pos2;
0309: if (pos == -1) {
0310: break OUTER; // not found, could not happen.
0311: }
0312: obj.delete(pos, delete);
0313: obj.insert(pos, "</h>");
0314: pos += 4;
0315: if (delete == 1) {
0316: obj.insert(pos, "<p>");
0317: pos += 3;
0318: }
0319: break;
0320: }
0321: }
0322: pos = obj.indexOf("<p>$", pos); // search the next one.
0323: }
0324: // ready, close all sections still open.
0325: for (; level > 0; level--) {
0326: obj.insert(obj.length(), "</section>");
0327: }
0328:
0329: }
0330:
0331: // check if on that position the string object contains a <ul> or <ol>
0332: static private boolean containsListTag(StringObject obj, int pos) {
0333: return obj.length() > pos + 4
0334: && obj.charAt(pos) == '<'
0335: && (obj.charAt(pos + 1) == 'u' || obj.charAt(pos + 1) == 'o')
0336: && obj.charAt(pos + 2) == 'l'
0337: && obj.charAt(pos + 3) == '>';
0338: }
0339:
0340: /**
0341: * Make <p> </p> tags.
0342: * @param leaveExtraNewLines (defaults to false) if false, 2 or more newlines starts a new p. If true, every 2 newlines starts new p, and every extra new line simply stays (inside the p).
0343: * @param surroundingP (defaults to true) wether the surrounding <p> should be included too.
0344: */
0345: static void handleParagraphs(StringObject obj,
0346: boolean leaveExtraNewLines, boolean surroundingP) {
0347: handleParagraphs(obj, leaveExtraNewLines, surroundingP, false);
0348: }
0349:
0350: /**
0351: * Make <p> </p> tags.
0352: * Note that if placeListsInsideP is <code>false</code>, the code generated with lists becomes akin to:
0353: * <p>...</p><ul>...</ul><p>...</p>
0354: *
0355: * If placeListsInsideP is <code>true</code>, the code becomes:
0356: * <p>...<ul>...</ul>...</p>
0357: *
0358: * If there is no content in front of the first list, or after the last list, those paragraphs are empty and may not be
0359: * added.
0360: *
0361: * @param leaveExtraNewLines (defaults to false) if false, 2 or more newlines starts a new p. If true, every 2 newlines starts new p, and every extra new line simply stays (inside the p).
0362: * @param surroundingP (defaults to true) wether the surrounding <p> should be included too.
0363: * @param placeListsInsideP (defaults to false) wether a list should be placed inside a <p> (as allowed by xhtml2).
0364: */
0365: static void handleParagraphs(StringObject obj,
0366: boolean leaveExtraNewLines, boolean surroundingP,
0367: boolean placeListsInsideP) {
0368:
0369: log.debug(placeListsInsideP ? "placings lists INSIDE"
0370: : "placings lists OUTSIDE");
0371: // handle paragraphs:
0372: boolean inParagraph = true;
0373: int pos = 0;
0374: // we should actually test if the first bit is a list, and if so, skip it
0375: if (surroundingP) {
0376: if (!placeListsInsideP && containsListTag(obj, pos)) {
0377: //note: this does not take into account nested lists
0378: int posEnd = obj.indexOf("</" + obj.charAt(pos + 1)
0379: + "l>", pos + 1);
0380: // only continue this if this is a balanced list
0381: if (posEnd != -1) {
0382: pos = posEnd + 5;
0383: if (obj.length() > pos && obj.charAt(pos) == '\n') {
0384: obj.delete(pos, 1);
0385: }
0386: if (pos >= obj.length()) {
0387: return;
0388: }
0389: }
0390: }
0391: obj.insert(pos, "<p>");
0392: pos += 3;
0393: } else {
0394: // if the code starts with a list, and it should be placed outside a paragraph,
0395: // add a \n to make sure that the list is parsed
0396: if (!placeListsInsideP && containsListTag(obj, pos)) {
0397: obj.insert(pos, "\n");
0398: }
0399: }
0400: boolean start = true;
0401: while (pos < obj.length()) {
0402: // one or more empty lines.
0403: if (start) {
0404: start = false;
0405: pos = obj.indexOf("\n", pos);
0406: } else {
0407: pos = obj.indexOf("\n", pos + 1);
0408: }
0409: if (pos == -1)
0410: break;
0411:
0412: int skip = 1;
0413: int l = obj.length();
0414: while (pos + skip < l
0415: && Character.isWhitespace(obj.charAt(pos + skip))) {
0416: if (obj.charAt(pos + skip) == '\n') {
0417: break;
0418: }
0419: skip++;
0420: }
0421: if (pos + skip >= l)
0422: break;
0423: // we need at least 2 lines for a paragraph.
0424: // however, if we instead have a list now, and we are not placeListsInsideP,
0425: // we should still terminate the paragraph, as the ul then falls outside
0426: // the paragraph.
0427: if (obj.charAt(pos + skip) != '\n') {
0428: if (!containsListTag(obj, pos + skip)) {
0429: continue;
0430: }
0431: obj.delete(pos, skip);
0432: if (placeListsInsideP) {
0433: int posEnd = obj.indexOf("</" + obj.charAt(pos + 1)
0434: + "l>", pos + 1);
0435: if (posEnd != -1) {
0436: pos = posEnd + 5;
0437: if (obj.length() > pos
0438: && obj.charAt(pos) == '\n'
0439: && (obj.length() == pos + 1 || obj
0440: .charAt(pos + 1) != '\n')) {
0441: obj.delete(pos, 1);
0442: }
0443: }
0444: continue;
0445: }
0446: } else {
0447: // delete the 2 new lines of the p.
0448: obj.delete(pos, skip + 1);
0449: }
0450:
0451: if (leaveExtraNewLines) {
0452: while (obj.length() > pos
0453: && Character.isWhitespace(obj.charAt(pos))) {
0454: pos++;
0455: }
0456: } else {
0457: while (obj.length() > pos
0458: && Character.isWhitespace(obj.charAt(pos))) {
0459: obj.delete(pos, 1); // delete the extra new lines too
0460: }
0461: }
0462: if (inParagraph) { // close the previous paragraph.
0463: obj.insert(pos, "</p>");
0464: inParagraph = false;
0465: pos += 4;
0466: }
0467: // initialize skip for leading whitespace
0468: skip = 0;
0469: // if the next code happens to be a list tag (ul/ol), we can do two things:
0470: // - place the list outside the paragraph (if we are not placeListsInsideP).
0471: // In that case, we should not start a new
0472: // paragraph until after the list. Moreover, if we are then at the end of the
0473: // text we should not include a paragraph at all unless it is enforced.
0474: // - include de ul in the paragraph. In that case, we simply continue as normal
0475: if (!placeListsInsideP && obj.length() > pos
0476: && containsListTag(obj, pos)) {
0477: int posEnd = obj.indexOf("</" + obj.charAt(pos + 1)
0478: + "l>", pos + 1);
0479: // only continue this if this is a balanced list
0480: if (posEnd != -1) {
0481: pos = posEnd + 5;
0482: // skip all whitespace after a list.
0483: int newlines = 0;
0484: while (obj.length() > (pos + skip)
0485: && Character.isWhitespace(obj.charAt(pos
0486: + skip))) {
0487: if (obj.charAt(pos + skip) == '\n') {
0488: newlines++;
0489: }
0490: if (newlines > 1 && leaveExtraNewLines) {
0491: skip++; // count whitespace after the second newline,
0492: // to include in the next paragraph
0493: } else {
0494: obj.delete(pos, 1); // delete whitespace
0495: }
0496: }
0497: // if no text follows, and we don't need an extra paragraphs, skip
0498: // note that we always add a <p> if we have the 'ommitsurrounding' option
0499: // - because the option expects this.
0500: if (surroundingP && pos == obj.length()) {
0501: break;
0502: }
0503: }
0504: }
0505: // next paragraph.
0506: obj.insert(pos, "\r<p>");
0507: pos += skip + 4;
0508: inParagraph = true;
0509: }
0510: if (inParagraph) { // in current impl. this is always true
0511:
0512: // read whole text, but stil in paragraph
0513: // if text ends with newline, take it away, because it then means </p> rather then <br />
0514: if (obj.length() > 0) {
0515: if (obj.charAt(obj.length() - 1) == '\n') {
0516: obj.delete(obj.length() - 1, 1);
0517: }
0518: }
0519: if (surroundingP) {
0520: obj.insert(obj.length(), "</p>");
0521: }
0522: }
0523: }
0524:
0525: /**
0526: * Wikipedia syntax for tables. (simplified)
0527: * <pre>
0528: * {|
0529: * | a || b || c
0530: * |-
0531: * | d || e || f
0532: * |}
0533: * </pre>
0534: * or e.g.
0535: * <pre>
0536: * {|-
0537: * |+ caption
0538: * ! A !! B !! C
0539: * |-
0540: * | d
0541: * | e
0542: * | f
0543: * |}
0544: * </pre>
0545: *@since MMBase 1.8
0546: */
0547: static void handleTables(StringObject obj) {
0548: int tables = 0;
0549: int pos = 0;
0550: while (pos != -1) {
0551: // always at beginning of line when here.
0552: int l = obj.length();
0553: if (pos + 2 < l
0554: && (obj.charAt(pos) == '{' && obj.charAt(pos + 1) == '|')) {
0555: int skip = 2;
0556: // allow for starting with {|- as well
0557: if (pos + skip < l && obj.charAt(pos + skip) == '-')
0558: skip++;
0559: // allow some trailing whitespace
0560: while (pos + skip < l
0561: && Character.isWhitespace(obj
0562: .charAt(pos + skip))) {
0563: if (obj.charAt(pos + skip) == '\n') {
0564: break;
0565: }
0566: skip++;
0567: }
0568: if (pos + skip >= l)
0569: break;
0570: if (obj.charAt(pos + skip) != '\n') {
0571: pos = obj.indexOf("\n", pos + skip);
0572: continue;
0573: }
0574: skip++;
0575: log.debug("ok, this is a table!");
0576: // don't use l onwards, length of obj will change
0577:
0578: if (pos > 0 && obj.charAt(pos - 1) == '\n') {
0579: obj.delete(pos - 1, 1);
0580: pos--;
0581: }
0582: if (pos > 0 && obj.charAt(pos - 1) == '\n') {
0583: obj.delete(pos - 1, 1);
0584: pos--;
0585: }
0586: tables++;
0587: obj.delete(pos, skip);
0588: obj.insert(pos, "</p><table>");
0589: pos += 11;
0590: if (obj.charAt(pos) == '|'
0591: && obj.charAt(pos + 1) == '+') {
0592: obj.delete(pos, 2);
0593: obj.insert(pos, "<caption>");
0594: pos += 9;
0595: pos = obj.indexOf("\n", pos);
0596: obj.delete(pos, 1);
0597: obj.insert(pos, "</caption>");
0598: pos += 10;
0599: }
0600: obj.insert(pos, "<tr>");
0601: pos += 4;
0602: }
0603: if (pos >= obj.length())
0604: break;
0605: // always in tr here.
0606: if (tables > 0) {
0607: if (obj.charAt(pos) == '|') {
0608: obj.delete(pos, 1);
0609:
0610: if (pos + 2 < obj.length()
0611: && (obj.charAt(pos) == '-' && obj
0612: .charAt(pos + 1) == '\n')) {
0613: obj.delete(pos, 2);
0614: obj.insert(pos, "</tr><tr>");
0615: pos += 9;
0616: } else if (pos + 1 < obj.length()
0617: && (obj.charAt(pos) == '}' && (pos + 2 == obj
0618: .length() || obj.charAt(pos + 1) == '\n'))) {
0619: obj.delete(pos, 2);
0620: obj.insert(pos, "</tr></table>");
0621: tables--;
0622: pos += 13;
0623: if (tables == 0) {
0624: obj.insert(pos, "<p>");
0625: pos += 3;
0626: }
0627: while (pos < obj.length()
0628: && obj.charAt(pos) == '\n')
0629: obj.delete(pos, 1);
0630: } else if (pos + 3 < obj.length()
0631: && (obj.charAt(pos) == '\n'
0632: && obj.charAt(pos + 1) == '{' && obj
0633: .charAt(pos + 2) == '|')) {
0634: obj.delete(pos, 3);
0635: obj.insert(pos, "<td><table><tr>");
0636: pos += 15;
0637: tables++;
0638: } else {
0639: obj.insert(pos, "<td>");
0640: pos += 4;
0641: int nl = obj.indexOf("\n", pos);
0642: int pipe = obj.indexOf("||", pos);
0643: int end = pipe == -1 || nl < pipe ? nl : pipe;
0644: if (end == -1)
0645: end += obj.length();
0646: pos = end;
0647: obj.delete(pos, 1);
0648: obj.insert(pos, "</td>");
0649: pos += 5;
0650: }
0651: continue;
0652: } else if (obj.charAt(pos) == '!') {
0653: obj.delete(pos, 1);
0654: obj.insert(pos, "<th>");
0655: pos += 4;
0656: int nl = obj.indexOf("\n", pos);
0657: int pipe = obj.indexOf("!!", pos);
0658: int end = pipe == -1 || nl < pipe ? nl : pipe;
0659: if (end == -1)
0660: end += obj.length();
0661: pos = end;
0662: obj.delete(pos, 1);
0663: obj.insert(pos, "</th>");
0664: pos += 5;
0665: continue;
0666: } else {
0667: pos = obj.indexOf("\n", pos) + 1;
0668: if (pos >= obj.length())
0669: break;
0670: // oddd. what to do know?
0671: }
0672: } else { // not in table, ignore find next new line
0673: pos = obj.indexOf("\n", pos) + 1;
0674: if (pos == 0)
0675: break;
0676: if (pos >= obj.length())
0677: break;
0678: }
0679: }
0680: while (tables > 0) {
0681: obj.insert(pos, "</tr></table>");
0682: pos += 13;
0683: tables--;
0684: if (tables == 0) {
0685: obj.insert(pos, "<p>");
0686: pos += 3;
0687: while (pos < obj.length() && obj.charAt(pos) == '\n')
0688: obj.delete(pos, 1);
0689: }
0690: }
0691:
0692: }
0693:
0694: /**
0695: * Removes all new lines and space which are too much.
0696: */
0697: static void cleanupText(StringObject obj) {
0698: // remaining new lines have no meaning.
0699: obj.replace(">\n", ">"); // don't replace by space if it is just after a tag, it could have a meaning then.
0700: obj.replace("\n", " "); // replace by space, because people could use it as word boundary.
0701: // remaining double spaces have no meaning as well:
0702: int pos = obj.indexOf(" ", 0);
0703: while (pos != -1) {
0704: pos++;
0705: while (obj.length() > pos && obj.charAt(pos) == ' ') {
0706: obj.delete(pos, 1);
0707: }
0708: pos = obj.indexOf(" ", pos);
0709: }
0710: // we used \r for non significant newlines:
0711: obj.replace("\r", "");
0712:
0713: }
0714:
0715: /**
0716: * Only escape, clean up.
0717: * @since MMBase-1.7
0718: */
0719: protected static void handleFormat(StringObject obj, boolean format) {
0720: if (format) {
0721: obj.replace("\r", "\n");
0722: } else {
0723: cleanupText(obj);
0724: }
0725:
0726: }
0727:
0728: protected static String prepareDataString(String data) {
0729: return Xml.XMLEscape(data).replaceAll("\r", ""); // drop returns (\r), we work with newlines, \r will be used as a help.
0730: }
0731:
0732: protected static StringObject prepareData(String data) {
0733: return new StringObject(prepareDataString(data));
0734: }
0735:
0736: /**
0737: * Constant for use as argument of {@link #handleRich}
0738: * @since MMBase-1.9
0739: */
0740: protected final static boolean SECTIONS = true;
0741: protected final static boolean NO_SECTIONS = false;
0742: protected final static boolean LEAVE_NEWLINES = true;
0743: protected final static boolean REMOVE_NEWLINES = false;
0744: protected final static boolean SURROUNDING_P = true;
0745: protected final static boolean NO_SURROUNDING_P = false;
0746: protected final static boolean LISTS_INSIDE_P = true;
0747: protected final static boolean LISTS_OUTSIDE_P = false;
0748:
0749: protected static void handleRich(StringObject obj,
0750: boolean sections, boolean leaveExtraNewLines,
0751: boolean surroundingP) {
0752: handleRich(obj, sections, leaveExtraNewLines, surroundingP,
0753: LISTS_OUTSIDE_P);
0754: }
0755:
0756: protected static void handleRich(StringObject obj,
0757: boolean sections, boolean leaveExtraNewLines,
0758: boolean surroundingP, boolean placeListsInsideP) {
0759: // the order _is_ important!
0760: handleList(obj);
0761: handleTables(obj);
0762: handleParagraphs(obj, leaveExtraNewLines, surroundingP,
0763: placeListsInsideP);
0764: if (sections) {
0765: handleHeaders(obj);
0766: }
0767: handleEmph(obj, '_', "em");
0768: handleEmph(obj, '*', "strong");
0769: }
0770:
0771: static void handleNewlines(StringObject obj) {
0772: obj.replace("</ul>\n", "</ul>"); // otherwise we will wind up with the silly "</ul><br />" the \n was necessary for </ul></p>
0773: obj.replace("\n", "<br />\r"); // handle new remaining newlines.
0774: }
0775:
0776: private static Pattern wikiWrappingAnchor = Pattern
0777: .compile("\\[(\\w+):(.*?)\\]");
0778: private static Pattern wikiP = Pattern.compile("<p>\\[(\\w+)\\]");
0779: private static Pattern wikiSection = Pattern
0780: .compile("<section><h>\\[(\\w+)\\]");
0781: private static Pattern wikiAnchor = Pattern.compile("\\[(\\w+)\\]");
0782:
0783: public static String wikiToXML(String data,
0784: boolean placeListsInsideP) {
0785: Matcher wrappingAnchors = wikiWrappingAnchor
0786: .matcher(prepareDataString(data));
0787: data = wrappingAnchors.replaceAll("<a id=\"$1\">$2</a>");
0788: StringObject obj = new StringObject(data);
0789: handleRich(obj, SECTIONS, REMOVE_NEWLINES, SURROUNDING_P,
0790: placeListsInsideP);
0791: handleFormat(obj, false);
0792: String string = obj.toString();
0793: Matcher ps = wikiP.matcher(string);
0794: string = ps.replaceAll("<p id=\"$1\">");
0795: Matcher sections = wikiSection.matcher(string);
0796: string = sections.replaceAll("<section id=\"$1\"><h>");
0797: Matcher anchors = wikiAnchor.matcher(string);
0798: string = anchors.replaceAll("<a id=\"$1\" />");
0799: return string;
0800: }
0801:
0802: public static String wikiToXML(String data) {
0803: return wikiToXML(data, LISTS_OUTSIDE_P);
0804: }
0805:
0806: /**
0807: * Defines a kind of 'rich' text format. This is a way to easily
0808: * type structured text in XML. The XML tags which can be
0809: * produced by this are all HTML as well.
0810: *
0811: * This is a generalisation of the MMBase html() functions which
0812: * does similar duties, but hopefully this one is better, and more
0813: * powerfull too.
0814: *
0815: * The following things are recognized:
0816: * <ul>
0817: * <li> Firstly, XMLEscape is called.</li>
0818: * <li> A line starting with an asterix (*) will start an unnumberd
0819: * list. The first new line not starting with a space or an other
0820: * asterix will end the list </li>
0821: * <li> Underscores are translated to the emphasize HTML-tag</li>
0822: * <li> You can create a header tag by by starting a line with a dollar signs</li>
0823: * <li> A paragraph can be begun (and ended) with an empty line.</li>
0824: * </ul>
0825: *
0826: * Test with commandline: java org.mmbase.util.Encode RICH_TEXT (reads from STDIN)
0827: *
0828: * @param data text to convert
0829: * @param format if the resulting XML must be nicely formatted (default: false)
0830: * @return the converted text
0831: */
0832:
0833: public static String richToXML(String data, boolean format,
0834: boolean placeListsInsideP) {
0835: StringObject obj = prepareData(data);
0836: handleRich(obj, SECTIONS, LEAVE_NEWLINES, SURROUNDING_P,
0837: placeListsInsideP);
0838: handleNewlines(obj);
0839: handleFormat(obj, format);
0840: return obj.toString();
0841: }
0842:
0843: public static String richToXML(String data, boolean format) {
0844: return richToXML(data, format, LISTS_OUTSIDE_P);
0845: }
0846:
0847: public static String richToXML(String data) {
0848: return richToXML(data, false);
0849: }
0850:
0851: /**
0852: * As richToXML but a little less rich. Which means that only one new line is non significant.
0853: * @see #richToXML
0854: */
0855: public static String poorToXML(String data, boolean format,
0856: boolean placeListsInsideP) {
0857: StringObject obj = prepareData(data);
0858: handleRich(obj, SECTIONS, REMOVE_NEWLINES, SURROUNDING_P,
0859: placeListsInsideP);
0860: handleFormat(obj, format);
0861: return obj.toString();
0862: }
0863:
0864: public static String poorToXML(String data, boolean format) {
0865: return poorToXML(data, format, LISTS_OUTSIDE_P);
0866: }
0867:
0868: public static String poorToXML(String data) {
0869: return poorToXML(data, false);
0870: }
0871:
0872: /**
0873: * So poor, that it actually generates pieces of XHTML 1.1 blocks (so, no use of sections).
0874: *
0875: * @see #richToXML
0876: * @since MMBase-1.7
0877: */
0878: public static String richToHTMLBlock(String data,
0879: boolean multipibleBrs, boolean surroundingP,
0880: boolean placeListsInsideP) {
0881: StringObject obj = prepareData(data);
0882:
0883: handleRich(obj, false, multipibleBrs, surroundingP,
0884: placeListsInsideP);
0885: // no <section> tags, leave newlines if multipble br's requested
0886:
0887: handleNewlines(obj);
0888: handleFormat(obj, false);
0889: return obj.toString();
0890: }
0891:
0892: public static String richToHTMLBlock(String data) {
0893: return richToHTMLBlock(data, false, true, true);
0894: }
0895:
0896: public static String richToHTMLBlock(String data,
0897: boolean multipibleBrs, boolean surroundingP) {
0898: return richToHTMLBlock(data, multipibleBrs, surroundingP,
0899: LISTS_OUTSIDE_P);
0900: }
0901:
0902: /**
0903: * So poor, that it actually generates pieces of XHTML 1.1 inlines (so, no use of section, br, p).
0904: *
0905: * @since MMBase-1.7
0906: */
0907: public static String poorToHTMLInline(String data) {
0908: StringObject obj = prepareData(data);
0909: // don't add newlines.
0910: handleFormat(obj, false);
0911: handleEmph(obj, '_', "em");
0912: handleEmph(obj, '*', "strong");
0913: return obj.toString();
0914: }
0915:
0916: /**
0917: * Base function for XSL conversions.
0918: */
0919:
0920: protected static String XSLTransform(String xslFile, String data) {
0921: try {
0922: java.net.URL u = ResourceLoader.getConfigurationRoot()
0923: .getResource("xslt/" + xslFile);
0924: java.io.StringWriter res = new java.io.StringWriter();
0925: XSLTransformer.transform(new StreamSource(new StringReader(
0926: data)), u, new StreamResult(res), null);
0927: return res.toString();
0928: } catch (javax.xml.transform.TransformerException te) {
0929: return te.getMessage();
0930: }
0931: }
0932:
0933: protected static void validate(String incoming)
0934: throws FormatException {
0935: try {
0936: if (log.isDebugEnabled()) {
0937: log.debug("Validating " + incoming);
0938: }
0939: javax.xml.parsers.DocumentBuilderFactory dfactory = javax.xml.parsers.DocumentBuilderFactory
0940: .newInstance();
0941:
0942: // turn validating on..
0943: dfactory.setValidating(true);
0944: dfactory.setNamespaceAware(true);
0945: javax.xml.parsers.DocumentBuilder documentBuilder = dfactory
0946: .newDocumentBuilder();
0947:
0948: // in order to find the dtd.....
0949: org.mmbase.util.XMLEntityResolver resolver = new org.mmbase.util.XMLEntityResolver();
0950: documentBuilder.setEntityResolver(resolver);
0951:
0952: // in order to log our xml-errors
0953: StringBuilder errorBuff = new StringBuilder();
0954: ErrorHandler errorHandler = new ErrorHandler(errorBuff);
0955: documentBuilder.setErrorHandler(errorHandler);
0956: // documentBuilder.init();
0957: java.io.InputStream input = new java.io.ByteArrayInputStream(
0958: incoming.getBytes(CODING));
0959: documentBuilder.parse(input);
0960:
0961: if (!resolver.hasDTD()) {
0962: throw new FormatException(
0963: "no doc-type specified for the xml");
0964: }
0965: if (errorHandler.errorOrWarning) {
0966: throw new FormatException("error in xml: \n"
0967: + errorBuff.toString());
0968: }
0969: } catch (javax.xml.parsers.ParserConfigurationException pce) {
0970: throw new FormatException(
0971: "[sax parser] not well formed xml: "
0972: + pce.toString());
0973: } catch (org.xml.sax.SAXException se) {
0974: log.debug("", se);
0975: //throw new FormatException("[sax] not well formed xml: "+se.toString() + "("+se.getMessage()+")");
0976: } catch (java.io.IOException ioe) {
0977: throw new FormatException("[io] not well formed xml: "
0978: + ioe.toString());
0979: }
0980: }
0981:
0982: protected static class FormatException extends java.lang.Exception {
0983: FormatException(String msg) {
0984: super (msg);
0985: }
0986: }
0987:
0988: // Catch any errors or warnings,....
0989: static class ErrorHandler implements org.xml.sax.ErrorHandler {
0990: boolean errorOrWarning;
0991: StringBuilder errorBuff;
0992:
0993: ErrorHandler(StringBuilder errorBuff) {
0994: super ();
0995: this .errorBuff = errorBuff;
0996: errorOrWarning = false;
0997: }
0998:
0999: // all methods from org.xml.sax.ErrorHandler
1000: // from org.xml.sax.ErrorHandler
1001: public void fatalError(org.xml.sax.SAXParseException exc) {
1002: errorBuff.append("FATAL[" + getLocationString(exc) + "]:"
1003: + exc.getMessage() + "\n");
1004: errorOrWarning = true;
1005: }
1006:
1007: // from org.xml.sax.ErrorHandler
1008: public void error(org.xml.sax.SAXParseException exc) {
1009: errorBuff.append("Error[" + getLocationString(exc) + "]: "
1010: + exc.getMessage() + "\n");
1011: errorOrWarning = true;
1012: }
1013:
1014: // from org.xml.sax.ErrorHandler
1015: public void warning(org.xml.sax.SAXParseException exc) {
1016: errorBuff.append("Warning[" + getLocationString(exc) + "]:"
1017: + exc.getMessage() + "\n");
1018: errorOrWarning = true;
1019: }
1020:
1021: // helper methods
1022: /**
1023: * Returns a string of the location.
1024: */
1025: private String getLocationString(
1026: org.xml.sax.SAXParseException ex) {
1027: StringBuilder str = new StringBuilder();
1028: String systemId = ex.getSystemId();
1029: if (systemId != null) {
1030: int index = systemId.lastIndexOf('/');
1031: if (index != -1) {
1032: systemId = systemId.substring(index + 1);
1033: }
1034: str.append(systemId);
1035: }
1036: str.append(" line:");
1037: str.append(ex.getLineNumber());
1038: str.append(" column:");
1039: str.append(ex.getColumnNumber());
1040: return str.toString();
1041: }
1042: }
1043:
1044: public XmlField() {
1045: super ();
1046: }
1047:
/**
 * Creates an instance for the given coding.
 *
 * @param to coding identifier, handed unchanged to the superclass constructor;
 *           presumably one of the int constants of this class (e.g.
 *           {@code ASCII}, {@code POORBODY}) -- verify against the superclass
 */
public XmlField(int to) {
    super(to);
}
1051:
1052: public Map<String, Config> transformers() {
1053: Map<String, Config> h = new HashMap<String, Config>();
1054: h.put("MMXF_ASCII", new Config(XmlField.class, ASCII,
1055: "Converts xml to ASCII (cannoted be reversed)"));
1056: h.put("MMXF_BODY_RICH", new Config(XmlField.class, RICHBODY,
1057: "XHTML 2 compliant XML."));
1058: h.put("MMXF_BODY_POOR", new Config(XmlField.class, POORBODY,
1059: "XHTML 2 compliant XML, but withough <br/> tags"));
1060: h.put("MMXF_HTML_INLINE", new Config(XmlField.class,
1061: HTML_INLINE, "Decodes only escaping and with <em>"));
1062: h
1063: .put(
1064: "MMXF_HTML_BLOCK",
1065: new Config(XmlField.class, HTML_BLOCK,
1066: "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>"));
1067: h
1068: .put(
1069: "MMXF_HTML_BLOCK_BR",
1070: new Config(XmlField.class, HTML_BLOCK_BR,
1071: "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>"));
1072: h
1073: .put(
1074: "MMXF_HTML_BLOCK_NOSURROUNDINGP",
1075: new Config(XmlField.class,
1076: HTML_BLOCK_NOSURROUNDINGP,
1077: "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>"));
1078: h
1079: .put(
1080: "MMXF_HTML_BLOCK_BR_NOSURROUNDINGP",
1081: new Config(XmlField.class,
1082: HTML_BLOCK_BR_NOSURROUNDINGP,
1083: "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>"));
1084: h
1085: .put(
1086: "MMXF_HTML_BLOCK_LIST",
1087: new Config(
1088: XmlField.class,
1089: HTML_BLOCK_LIST,
1090: "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>, with <ul> inside the <p>"));
1091: h
1092: .put(
1093: "MMXF_HTML_BLOCK_LIST_NOSURROUNDINGP",
1094: new Config(
1095: XmlField.class,
1096: HTML_BLOCK_LIST_NOSURROUNDINGP,
1097: "Decodes only escaping and with <em>, <p>, <br /> (only one) and <ul>, with <ul> inside the <p>"));
1098: h
1099: .put(
1100: "MMXF_HTML_BLOCK_LIST_BR",
1101: new Config(
1102: XmlField.class,
1103: HTML_BLOCK_LIST_BR,
1104: "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>, with <ul> inside the <p>"));
1105: h
1106: .put(
1107: "MMXF_HTML_BLOCK_LIST_BR_NOSURROUNDINGP",
1108: new Config(
1109: XmlField.class,
1110: HTML_BLOCK_LIST_BR_NOSURROUNDINGP,
1111: "Decodes only escaping and with <em>, <p>, <br /> (also multiples) and <ul>, with <ul> inside the <p>"));
1112: h.put("MMXF_XHTML", new Config(XmlField.class, XHTML,
1113: "Converts to piece of XHTML"));
1114: return h;
1115: }
1116:
1117: public String transform(String data) {
1118: switch (to) {
1119: case RICHBODY:
1120: case POORBODY:
1121: throw new UnsupportedOperationException();
1122: // XXXX
1123: // needing richtext xslt here.
1124: //return XSLTransform("mmxf2rich.xslt", XML_TAGSTART + data + XML_TAGEND);
1125: case ASCII:
1126: return XSLTransform("text.xslt", data);
1127: case HTML_BLOCK:
1128: case HTML_BLOCK_BR:
1129: case HTML_INLINE:
1130: throw new UnsupportedOperationException("Cannot transform");
1131: default:
1132: throw new UnknownCodingException(getClass(), to);
1133: }
1134: }
1135:
1136: public String transformBack(String r) {
1137: String result = null;
1138: switch (to) {
1139: case RICHBODY:
1140: result = richToXML(r);
1141: // rich will not be validated... Cannot be used yet!!
1142: break;
1143: case POORBODY:
1144: result = poorToXML(r);
1145: break;
1146: case HTML_BLOCK:
1147: result = richToHTMLBlock(r, false, true, true);
1148: break;
1149: case HTML_BLOCK_BR:
1150: result = richToHTMLBlock(r, true, true, true);
1151: break;
1152: case HTML_BLOCK_NOSURROUNDINGP:
1153: result = richToHTMLBlock(r, false, false, true);
1154: break;
1155: case HTML_BLOCK_BR_NOSURROUNDINGP:
1156: result = richToHTMLBlock(r, true, false, true);
1157: break;
1158:
1159: case HTML_BLOCK_LIST:
1160: result = richToHTMLBlock(r, false, true, false);
1161: break;
1162: case HTML_BLOCK_LIST_BR:
1163: result = richToHTMLBlock(r, true, true, false);
1164: break;
1165: case HTML_BLOCK_LIST_NOSURROUNDINGP:
1166: result = richToHTMLBlock(r, false, false, false);
1167: break;
1168: case HTML_BLOCK_LIST_BR_NOSURROUNDINGP:
1169: result = richToHTMLBlock(r, true, false, false);
1170: break;
1171:
1172: case HTML_INLINE:
1173: result = poorToHTMLInline(r);
1174: break;
1175: case ASCII:
1176: throw new UnsupportedOperationException("Cannot transform");
1177: default:
1178: throw new UnknownCodingException(getClass(), to);
1179: }
1180: return result;
1181: }
1182:
1183: public String getEncoding() {
1184: switch (to) {
1185: case RICHBODY:
1186: return "MMXF_BODY_RICH";
1187: case POORBODY:
1188: return "MMXF_BODY_POOR";
1189: case HTML_BLOCK:
1190: return "MMXF_HTML_BLOCK";
1191: case HTML_BLOCK_BR:
1192: return "MMXF_HTML_BLOCK_BR";
1193: case HTML_BLOCK_NOSURROUNDINGP:
1194: return "MMXF_HTML_BLOCK_NOSURROUNDINGP";
1195: case HTML_BLOCK_BR_NOSURROUNDINGP:
1196: return "MMXF_HTML_BLOCK_BR_NOSURROUNDINGP";
1197: case HTML_BLOCK_LIST:
1198: return "MMXF_HTML_BLOCK_LIST";
1199: case HTML_BLOCK_LIST_BR:
1200: return "MMXF_HTML_BLOCK_LIST_BR";
1201: case HTML_BLOCK_LIST_NOSURROUNDINGP:
1202: return "MMXF_HTML_BLOCK_LIST_NOSURROUNDINGP";
1203: case HTML_BLOCK_LIST_BR_NOSURROUNDINGP:
1204: return "MMXF_HTML_BLOCK_LIST_BR_NOSURROUNDINGP";
1205: case HTML_INLINE:
1206: return "MMXF_HTML_INLINE";
1207: case ASCII:
1208: return "MMXF_ASCII";
1209: default:
1210: throw new UnknownCodingException(getClass(), to);
1211: }
1212: }
1213: }
|