001: /*
002: JSPWiki - a JSP-based WikiWiki clone.
003:
004: Copyright (C) 2001-2006 Janne Jalkanen (Janne.Jalkanen@iki.fi)
005:
006: This program is free software; you can redistribute it and/or modify
007: it under the terms of the GNU Lesser General Public License as published by
008: the Free Software Foundation; either version 2.1 of the License, or
009: (at your option) any later version.
010:
011: This program is distributed in the hope that it will be useful,
012: but WITHOUT ANY WARRANTY; without even the implied warranty of
013: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
014: GNU Lesser General Public License for more details.
015:
016: You should have received a copy of the GNU Lesser General Public License
017: along with this program; if not, write to the Free Software
018: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
019: */
020:
021: package com.ecyrd.jspwiki.parser;
022:
023: import java.util.*;
024:
025: import org.apache.log4j.Logger;
026: import org.jdom.Attribute;
027:
028: /**
029: * Parses JSPWiki-style "augmented" link markup into a Link object
030: * containing the link text, link reference, and any optional link
031: * attributes (as JDOM Attributes).
032: * <p>
033: * The parser recognizes three link forms:
034: * </p>
035: * <ol>
036: * <li><tt> [Text] </tt></li>
037: * <li><tt> [Text | Link] </tt></li>
038: * <li><tt> [Text | Link | attributes] </tt></li>
039: * </ol>
040: * <p>
041: * where the attributes are space-delimited, each in the form of
042: * </p>
043: * <pre>
044: * name1='value1' name2='value2' name3='value3' (etc.) </pre>
045: * <p>
046: * If the attribute parsing fails, the parser will still return the
047: * basic link, writing a warning to the log.
048: * </p>
049: *
050: * <h3>Permitted Attributes</h3>
051: * <p>
052: * Attributes that aren't declared on <tt>&lt;a&gt;</tt> or those that
053: * permit scripting in HTML (as this is a security risk) are rejected
054: * with a logged warning and do not show up in the resulting attribute
055: * list. The 'href' and 'name' attributes are also rejected as spurious.
056: * The permitted list is: 'accesskey', 'charset', 'class', 'hreflang',
057: * 'id', 'lang', 'dir', 'rel', 'rev', 'style' , 'tabindex', 'target' ,
058: * 'title', and 'type'. The declared attributes that will be ignored
059: * are: 'href', 'name', 'shape', 'coords', 'onfocus', 'onblur', or any
060: * of the other 'on*' event attributes.
061: * </p>
062: * <p>
063: * The permitted attributes and target attribute values are static
064: * String arrays ({@link #PERMITTED_ATTRIBUTES} and
065: * {@link #PERMITTED_TARGET_VALUES} resp.) that could be compile-time
066: * modified (i.e., predeclared).
067: * </p>
068: *
069: * <h3>Permitted Values on Target Attribute</h3>
070: * <p>
071: * The following target names are reserved in HTML 4 and have special
072: * meanings. These are the only values permitted by the parser.
073: * <dl>
074: * <dt><b>_blank</b></dt>
075: * <dd> The user agent should load the designated document in a new,
076: * unnamed window. </dd>
077: * <dt><b>_self</b></dt>
078: * <dd> The user agent should load the document in the same frame as
079: * the element that refers to this target. </dd>
080: * <dt><b>_parent</b></dt>
081: * <dd> The user agent should load the document into the immediate
082: * FRAMESET parent of the current frame. This value is equivalent to
083: * _self if the current frame has no parent. </dd>
084: * <dt><b>_top</b></dt>
085: * <dd> The user agent should load the document into the full,
086: * original window (thus canceling all other frames). This value is
087: * equivalent to _self if the current frame has no parent. </dd>
088: * </dl>
089: *
090: * <h3>Returned Value</h3>
091: * <p>
092: * This returns a <b>Link</b> object, a public inner class with methods:
093: * <ul>
094: * <li> <tt>getText()</tt> returns the link text. </li>
095: * <li> <tt>getReference()</tt> returns the link reference value. </li>
096: * <li> <tt>attributeCount()</tt> returns the number of declared attributes. </li>
097: * <li> <tt>getAttributes()</tt> returns an iterator over any validated
098: * XHTML-compliant attributes, returned as JDOM Attributes.
099: * </li>
100: * </ul>
101: * <p>
102: * The <tt>attributeCount()</tt> method can be used to circumvent calling
103: * <tt>getAttributes()</tt>, which will create an empty Iterator rather
104: * than return a null.
105: * </p>
106: *
107: * <h3>Example: Link Form 1</h3>
108: * <p>
109: * From an incoming wikitext link of:
110: * <pre>
111: * [Acme] </pre>
112: * returns:
113: * <pre>
114: * getText(): "Acme"
115: * getReference(): "Acme"
116: * attributeCount(): 0
117: * getAttributes(): an empty Iterator </pre>
118: *
119: * <h3>Example: Link Form 2</h3>
120: * <p>
121: * From an incoming wikitext link of:
122: * <pre>
123: * [Acme | http://www.acme.com/] </pre>
124: * returns:
125: * <pre>
126: * getText(): "Acme"
127: * getReference(): "http://www.acme.com/"
128: * attributeCount(): 0
129: * getAttributes(): an empty Iterator </pre>
130: *
131: * <h3>Example: Link Form 3</h3>
132: * <p>
133: * From an incoming wikitext link of:
134: * </p>
135: * <pre>
136: * [Acme | http://www.acme.com/ | id='foo' rel='Next'] </pre>
137: * returns:
138: * <pre>
139: * getText(): "Acme"
140: * getReference(): "http://www.acme.com/"
141: * attributeCount(): 2
142: * getAttributes(): an Iterator containing:
143: * JDOM Attribute: id="foo"
144: * JDOM Attribute: rel="Next" </pre>
145: *
146: *
147: * @author Murray Altheim
148: * @since 2.5.10
149: */
150: public class LinkParser {
151: private static Logger log = Logger.getLogger(LinkParser.class);
152:
153: /** Permitted attributes on links. Keep this sorted. */
154: private static final String[] PERMITTED_ATTRIBUTES = new String[] {
155: "accesskey", "charset", "class", "dir", "hreflang", "id",
156: "lang", "rel", "rev", "style", "tabindex", "target",
157: "title", "type" };
158:
159: /** Permitted values on the 'target' attribute. */
160: private static final String[] PERMITTED_TARGET_VALUES = new String[] {
161: "_blank", "_self", "_parent", "_top" };
162:
163: private static final String EQSQUO = "='";
164: private static final String SQUO = "'";
165: private static final String EQ = "=";
166: private static final String TARGET = "target";
167: private static final String DELIMS = " \t\n\r\f=";
168:
169: private static final List m_EMPTY = new ArrayList();
170:
171: // ............
172:
173: /**
174: * Processes incoming link text, separating out the link text, the link
175: * URI, and then any specified attributes.
176: *
177: * @param linktext the wiki link text to be parsed
178: * @return a Link object containing the link text, reference, and any valid Attributes
179: * @throws ParseException if the parameter is null
180: */
181: public Link parse(String linktext) throws ParseException {
182: if (linktext == null) {
183: throw new ParseException("null value passed to link parser");
184: }
185:
186: Link link = null;
187:
188: try {
189: // establish link text and link ref
190: int cut1 = linktext.indexOf('|');
191: if (cut1 == -1) {
192: // link form 1: [Acme]
193: return new Link(linktext);
194: }
195:
196: int cut2 = cut1 + 1 < linktext.length() ? linktext.indexOf(
197: '|', cut1 + 1) : -1;
198:
199: if (cut2 == -1) {
200: // link form 2: [Acme | http://www.acme.com/]
201: String text = linktext.substring(0, cut1).trim(); // to cut1
202: String ref = linktext.substring(cut1 + 1).trim(); // cut1 to end
203: return new Link(text, ref);
204: }
205:
206: // otherwise: link form 3: [Acme | http://www.acme.com/ | id='foo' rel='Next']
207: String text = linktext.substring(0, cut1).trim(); // to cut1
208: String ref = linktext.substring(cut1 + 1, cut2).trim(); // cut1 to cut2
209: String attribs = linktext.substring(cut2 + 1).trim(); // cut2 to end
210:
211: link = new Link(text, ref);
212:
213: // parse attributes
214: if (attribs.indexOf(EQSQUO) != -1) // contains "='" that looks like attrib spec
215: {
216: try {
217: StringTokenizer tok = new StringTokenizer(attribs,
218: DELIMS, true);
219: while (tok.hasMoreTokens()) {
220: String token = tok.nextToken(DELIMS).trim(); // get attribute name token
221: while (isSpace(token) && tok.hasMoreTokens()) {
222: token = tok.nextToken(DELIMS).trim(); // eat any WS
223: }
224:
225: require(tok, EQ); // eat '=', break after '='
226: require(tok, SQUO); // eat opening delim
227: String value = tok.nextToken(SQUO); // using existing delim
228: require(tok, SQUO); // eat closing delim
229:
230: if (token != null && value != null) {
231: if (Arrays.binarySearch(
232: PERMITTED_ATTRIBUTES, token) >= 0) {
233: if (!token.equals(TARGET) // _blank _self _parent _top
234: || Arrays
235: .binarySearch(
236: PERMITTED_TARGET_VALUES,
237: value) >= 0) {
238: Attribute a = new Attribute(token,
239: value);
240: link.addAttribute(a);
241: } else {
242: throw new ParseException(
243: "unknown target attribute value='"
244: + value
245: + "' on link");
246: }
247: } else {
248: throw new ParseException(
249: "unknown attribute name '"
250: + token + "' on link");
251: }
252: } else {
253: throw new ParseException(
254: "unable to parse link attributes '"
255: + attribs + "'");
256:
257: }
258: }
259: } catch (ParseException pe) {
260: log.warn("syntax error parsing link attributes '"
261: + attribs + "': " + pe.getMessage());
262: } catch (NoSuchElementException nse) {
263: log
264: .warn("expected more tokens while parsing link attributes '"
265: + attribs + "'");
266: }
267: }
268:
269: } catch (Exception e) {
270: log.warn(e.getClass().getName()
271: + " thrown by link parser: " + e.getMessage());
272: }
273:
274: return link;
275: }
276:
277: private String require(StringTokenizer tok, String required)
278: throws ParseException, NoSuchElementException {
279: String s = tok.nextToken(required);
280: if (!s.equals(required)) {
281: throw new ParseException("expected '" + required
282: + "' not '" + s + "'"); // I18N
283: }
284: return s;
285: }
286:
287: /**
288: * Returns true if the String <tt>s</tt> is completely
289: * composed of whitespace.
290: *
291: * @param s The string to check
292: * @return True, if "s" is all XML whitespace.
293: */
294: public static final boolean isSpace(String s) {
295: for (int i = 0; i < s.length(); i++) {
296: if (!isSpace(s.charAt(i)))
297: return false;
298: }
299: return true;
300: }
301:
302: /**
303: * Returns true if char <tt>c</tt> is a member of
304: * <tt>S</tt> (space) [XML 1.1 production 3].
305: *
306: * @param c Character to check.
307: * @return True, if the character is an XML space.
308: */
309: public static final boolean isSpace(char c) {
310: return 0x20 == c // SPACE
311: || 0x0A == c // LF
312: || 0x0D == c // CR
313: || 0x09 == c // TAB
314: || 0x85 == c // NEL
315: || 0x2028 == c; // LS (line separator)
316: }
317:
318: // .........................................................................
319:
320: /**
321: * Inner class serving as a struct containing the parsed
322: * components of a link.
323: */
324: public static class Link {
325: private String m_text;
326: private String m_ref = null;
327: private int m_interwikiPoint = -1;
328: private List m_attribs = null;
329:
330: /**
331: * Create a new Link with text but no reference.
332: * @param text The link text.
333: * @throws ParseException If the link text is illegal.
334: */
335: protected Link(String text) throws ParseException {
336: setText(text);
337: }
338:
339: /**
340: * Create a new link with a given text and hyperlink (reference).
341: *
342: * @param text The link text.
343: * @param ref The hypertext reference.
344: * @throws ParseException If the link text or reference are illegal.
345: */
346: protected Link(String text, String ref) throws ParseException {
347: setText(text);
348: setReference(ref);
349: }
350:
351: /**
352: * Sets the link text.
353: *
354: * @param text The link text.
355: * @throws ParseException If the text is illegal (e.g. null).
356: */
357: protected void setText(String text) throws ParseException {
358: if (text == null) {
359: throw new ParseException("null link text");
360: }
361: m_text = text;
362: }
363:
364: /**
365: * Returns the link text.
366: *
367: * @return Link text.
368: */
369: public String getText() {
370: return m_text;
371: }
372:
373: /**
374: * Sets the hypertext reference. Typically, this is an URI or an interwiki link,
375: * or a wikilink.
376: *
377: * @param ref The reference.
378: * @throws ParseException If the reference is illegal.
379: */
380: protected void setReference(String ref) throws ParseException {
381: if (ref == null) {
382: throw new ParseException("null link reference value");
383: }
384: m_ref = ref;
385: }
386:
387: /**
388: * Returns true, if there is a reference.
389: *
390: * @return True, if there's a reference; false otherwise.
391: */
392: public boolean hasReference() {
393: return m_ref != null;
394: }
395:
396: /**
397: * Returns the link reference, or the link text if null.
398: *
399: * @return A link reference.
400: */
401: public String getReference() {
402: return m_ref != null ? m_ref : m_text;
403: }
404:
405: /**
406: * Returns true, if this Link represents an InterWiki link (of the form wiki:page).
407: *
408: * @return True, if this Link represents an InterWiki link.
409: */
410: public boolean isInterwikiLink() {
411: if (!hasReference())
412: m_ref = m_text;
413:
414: m_interwikiPoint = m_ref.indexOf(':');
415:
416: return m_interwikiPoint != -1;
417: }
418:
419: /**
420: * Returns the name of the wiki if this is an interwiki link.
421: * <pre>
422: * Link link = new Link("Foo","Wikipedia:Foobar");
423: * assert( link.getExternalWikiPage(), "Wikipedia" );
424: * </pre>
425: *
426: * @return Name of the wiki, or null, if this is not an interwiki link.
427: */
428: public String getExternalWiki() {
429: if (isInterwikiLink()) {
430: return m_ref.substring(0, m_interwikiPoint);
431: }
432:
433: return null;
434: }
435:
436: /**
437: * Returns the wikiname part of an interwiki link. Used only with interwiki links.
438: * <pre>
439: * Link link = new Link("Foo","Wikipedia:Foobar");
440: * assert( link.getExternalWikiPage(), "Foobar" );
441: * </pre>
442: *
443: * @return Wikiname part, or null, if this is not an interwiki link.
444: */
445: public String getExternalWikiPage() {
446: if (isInterwikiLink()) {
447: return m_ref.substring(m_interwikiPoint + 1);
448: }
449:
450: return null;
451: }
452:
453: /**
454: * Returns the number of attributes on this link.
455: *
456: * @return The number of attributes.
457: */
458: public int attributeCount() {
459: return m_attribs != null ? m_attribs.size() : 0;
460: }
461:
462: /**
463: * Adds another attribute to the link.
464: *
465: * @param attr A JDOM Attribute.
466: */
467: public void addAttribute(Attribute attr) {
468: if (m_attribs == null) {
469: m_attribs = new ArrayList();
470: }
471: m_attribs.add(attr);
472: }
473:
474: /**
475: * Returns an Iterator over the list of JDOM Attributes.
476: *
477: * @return Iterator over the attributes.
478: */
479: public Iterator getAttributes() {
480: return m_attribs != null ? m_attribs.iterator() : m_EMPTY
481: .iterator();
482: }
483:
484: /**
485: * Returns a wikitext string representation of this Link.
486: * @return WikiText.
487: */
488: public String toString() {
489: StringBuffer sb = new StringBuffer();
490: sb.append('[');
491: sb.append(m_text);
492:
493: if (m_ref != null) {
494: sb.append(' ');
495: sb.append('|');
496: sb.append(' ');
497: sb.append(m_ref);
498: }
499:
500: if (m_attribs != null) {
501: sb.append(' ');
502: sb.append('|');
503: Iterator it = getAttributes();
504: while (it.hasNext()) {
505: Attribute a = (Attribute) it.next();
506: sb.append(' ');
507: sb.append(a.getName());
508: sb.append('=');
509: sb.append('\'');
510: sb.append(a.getValue());
511: sb.append('\'');
512: }
513: }
514: sb.append(']');
515: return sb.toString();
516: }
517:
518: } // end inner class
519:
520: }
|