001: /*
002: * Created on Jan 30, 2005
003: *
004: */
005: package net.sf.thingamablog.generator;
006:
007: import java.io.StringReader;
008: import java.io.StringWriter;
009: import java.util.Hashtable;
010: import java.util.List;
011: import java.util.StringTokenizer;
012: import java.util.Vector;
013:
014: import net.sf.thingamablog.xml.Entities;
015:
016: import org.w3c.tidy.Tidy;
017:
018: public class HyperTextTag extends TextTag {
019: public static final String ENCODE_HTML = "encode_html";
020: public static final String STRIP_HTML = "strip_html";
021: public static final String WORDS = "words";
022: public static final String TIDY_HTML = "tidy_html";
023: public static final String CLOSE_EMPTY_TAGS = "close_empty_tags";
024: public static final String EMPTY_TAGS[] = { "br", "hr", "img",
025: "input", "area" };
026: public static final String FIND = "find";
027: public static final String REPLACE = "replace";
028: public static final Object EMPTY = new Object();
029:
030: //undocumented and experimental attributes
031: public static final String ESCAPE = "escape";
032: public static final String UNESCAPE = "unescape";
033: //escape types
034: private static final Hashtable ENTITIES = new Hashtable();
035: static {
036: ENTITIES.put("xml", Entities.XML);
037: ENTITIES.put("html_basic", Entities.HTML_BASIC);
038: ENTITIES.put("html32", Entities.HTML32);
039: ENTITIES.put("html40", Entities.HTML40);
040: ENTITIES.put("html40_full", Entities.HTML40_FULL);
041: }
042:
043: public HyperTextTag(String name) {
044: super (name);
045: Hashtable ht = getDefaultAttributes();
046: ht.put(ENCODE_HTML, "0");
047: ht.put(STRIP_HTML, "0");
048: ht.put(WORDS, "0");
049: ht.put(TIDY_HTML, "0");
050: ht.put(CLOSE_EMPTY_TAGS, "0");
051: ht.put(FIND, "");
052: ht.put(REPLACE, EMPTY);
053:
054: ht.put(ESCAPE, "0");
055: ht.put(UNESCAPE, "0");
056: }
057:
058: private String findReplace(String text, Hashtable attrs) {
059: Object rep = attrs.get(REPLACE);
060: if (!attrs.get(FIND).toString().equals("") && rep != EMPTY) {
061: String regex = attrs.get(FIND).toString().replaceAll(
062: "\\"", "\\\"");
063: String replace = rep.toString().replaceAll("\\"",
064: "\\\"");
065:
066: List regexs = tokenizeFindReplaceValues(regex);
067: List reps = tokenizeFindReplaceValues(replace);
068: for (int i = 0; i < regexs.size(); i++) {
069: String re = regexs.get(i).toString();
070: String rp = "";
071: try {
072: rp = reps.get(i).toString();
073: } catch (IndexOutOfBoundsException ex) {
074: }
075: //System.out.println(re + "-" + rp);
076: try {
077: text = text.replaceAll(re, rp);
078: } catch (Exception ex) {
079: }
080: }
081: }
082:
083: return text;
084: }
085:
086: private List tokenizeFindReplaceValues(String val) {
087: String delim = ",";
088: int pos = 0;
089: Vector tokens = new Vector();
090:
091: while (pos != -1) {
092: int npos = val.indexOf(delim, pos);
093: while (npos > 0 && val.charAt(npos - 1) == '\\')
094: npos = val.indexOf(delim, npos + delim.length());
095:
096: String tok;
097: if (npos == -1)
098: tok = val.substring(pos, val.length());
099: else {
100: tok = val.substring(pos, npos).trim();
101: npos += delim.length();
102: }
103:
104: tokens.add(tok.trim());
105: pos = npos;
106: }
107:
108: return tokens;
109: }
110:
111: public String process(Object obj, Hashtable attribs) {
112: String text = obj.toString();
113:
114: text = findReplace(text, attribs);
115:
116: int len = 0;
117: try {
118: len = Integer.parseInt(attribs.get(WORDS).toString());
119: } catch (Exception ex) {
120: }
121:
122: if (attribs.get(CLOSE_EMPTY_TAGS).toString().equals("1"))
123: for (int i = 0; i < EMPTY_TAGS.length; i++)
124: text = closeEmptyTags(text, EMPTY_TAGS[i]);
125:
126: if (len > 0)
127: text = tidyHTML(limitWords(text, len));
128:
129: if (attribs.get(TIDY_HTML).toString().equals("1"))
130: text = tidyHTML(text);
131:
132: if (attribs.get(STRIP_HTML).toString().equals("1"))
133: text = tidyHTML(text).replaceAll("\\<.*?\\>", "");
134:
135: if (attribs.get(ENCODE_HTML).toString().equals("1"))
136: text = encodeHTML(text);
137:
138: if (!attribs.get(ESCAPE).toString().equals("0")) {
139: Entities e = getEntityMap(attribs.get(ESCAPE).toString()
140: .trim());
141: text = e.escape(text);
142: System.out.println("\nESCAPED...\n" + text);
143: }
144:
145: if (!attribs.get(UNESCAPE).toString().equals("0")) {
146: String atr = attribs.get(UNESCAPE).toString().trim();
147: Entities e = getEntityMap(atr);
148: if (atr.equals("1"))
149: text = e.unescape(text);
150: else {
151: text = e.unescapeUnknownEntities(text);
152: }
153:
154: System.out.println("\nUNESCAPED...\n" + text);
155: }
156:
157: return text;
158: }
159:
160: private Entities getEntityMap(String type) {
161: Entities ents = (Entities) ENTITIES.get(type);
162: if (ents == null)
163: ents = Entities.HTML40_FULL;
164:
165: return ents;
166: }
167:
168: private String tidyHTML(String html) {
169:
170: Tidy tidy = new Tidy();
171: tidy.setXHTML(true);
172: tidy.setQuiet(true);
173: tidy.setShowWarnings(false);
174: System.err.println(tidy.getOutputEncoding());
175: tidy.setOutputEncoding("UTF-8");
176:
177: StringReader reader = new StringReader(html);
178: StringWriter writer = new StringWriter();
179: tidy.parse(reader, writer);
180:
181: html = writer.toString();
182: String bodyStart = "<body>";
183: String bodyEnd = "</body>";
184:
185: int s = html.indexOf(bodyStart);
186: int e = html.lastIndexOf(bodyEnd);
187: if (s != -1 && e != -1) {
188: html = html.substring(s + bodyStart.length(), e);
189: }
190:
191: //System.out.println("\n\nTIDY....\n" + html);
192:
193: //tidy.
194:
195: /*StringReader reader = new StringReader(html);
196: StringWriter writer = new StringWriter();
197:
198: XMLWriter xmlw = new XMLWriter(writer);
199: xmlw.setHTMLMode(true);
200:
201: XMLReader r = new org.ccil.cowan.tagsoup.Parser();
202:
203: try
204: {
205: r.setFeature(Parser.namespacesFeature, false);
206: r.setFeature(Parser.namespacePrefixesFeature, false);
207: r.setContentHandler(xmlw);
208: r.setProperty(Parser.lexicalHandlerProperty, xmlw);
209:
210: InputSource isrc = new InputSource(reader);
211: r.parse(isrc);
212:
213: html = writer.toString();
214: String bodyStart = "<body>";
215: String bodyEnd = "</body>";
216:
217: int s = html.indexOf(bodyStart);
218: int e = html.lastIndexOf(bodyEnd);
219: if(s != -1 && e != -1)
220: {
221: html = html.substring(s + bodyStart.length(), e);
222: }
223: }
224: catch(Exception ex){}
225: System.out.println("\n\nTIDY....\n" + html);*/
226: return html;
227: }
228:
229: private String closeEmptyTags(String html, String tagName) {
230: if (!tagName.startsWith("<"))
231: tagName = "<" + tagName;
232:
233: StringBuffer text = new StringBuffer(html);
234:
235: int p = 0;
236: while ((p = text.indexOf(tagName, p)) != -1) {
237: int end = text.indexOf(">", p);
238: if (end == -1)
239: break;
240: String tag = text.substring(p, end + 1);
241: p = end;
242:
243: if (tag.indexOf("\n") == -1 && text.charAt(end - 1) != '/') {
244: text.insert(end, " /");
245: p += 2;
246: }
247: }
248:
249: return text.toString();
250: }
251:
252: private String encodeHTML(String string) {
253: //return Entities.HTML40.escape(string, false);
254:
255: StringBuffer sb = new StringBuffer(string.length());
256: // true if last char was blank
257: //boolean lastWasBlankChar = false;
258: int len = string.length();
259: char c;
260:
261: for (int i = 0; i < len; i++) {
262: c = string.charAt(i);
263: if (c == ' ') {
264: // blank gets extra work,
265: // this solves the problem you get if you replace all
266: // blanks with , if you do that you lose
267: // word breaking
268:
269: //if(lastWasBlankChar)
270: //{
271: // lastWasBlankChar = false;
272: // sb.append("&nbsp;");
273: //}
274: //else
275: //{
276: // lastWasBlankChar = true;
277: sb.append(' ');
278: //}
279: } else {
280: //lastWasBlankChar = false;
281: //
282: // HTML Special Chars
283: if (c == '"')
284: sb.append(""");
285: else if (c == '&')
286: sb.append("&");
287: else if (c == '<')
288: sb.append("<");
289: else if (c == '>')
290: sb.append(">");
291: //else if (c == '\n')
292: // Handle Newline
293: // sb.append("<br/>");
294: else {
295: //int ci = 0xffff & c;
296: //if (ci < 160)
297: // nothing special only 7 Bit
298: sb.append(c);
299: //else
300: //{
301: // Not 7 Bit use the unicode system
302: //sb.append("&#");
303: //sb.append(new Integer(ci).toString());
304: //sb.append(';');
305: //}
306: }
307: }
308: }
309: return sb.toString();
310: }
311:
312: /*
313: // added by John Montgomery - strips HTML tags from entries
314: private String stripHTML(String s)
315: {
316: if (s.indexOf('<') < 0 && s.indexOf('>') < 0 && s.indexOf('&') < 0)
317: return s;
318:
319: StringBuffer buffer = new StringBuffer(s.length());
320:
321: int index = -1;
322: while ((index = s.indexOf('<')) != -1)
323: {
324: String head = s.substring(0, index);
325: String tail = s.substring(index);
326: buffer.append(head);
327: index = tail.indexOf('>');
328: if (index != -1) // if it's -1 we're partway thru a tag
329: {
330: tail = tail.substring(index + 1);
331: }
332: else
333: {
334: //convert broken tags or trailing '<' to <
335: //so we don't get stuck in an infinte loop
336: tail = encodeHTML(tail);
337: }
338: s = tail;
339: }
340: buffer.append(s); // add what ever is left over
341: return encodeHTML(buffer.toString().trim());
342: }
343: */
344:
345: private String limitWords(String text, int n) {
346: StringTokenizer st = new StringTokenizer(text);
347: int count = 0;
348: String words = "";
349:
350: while (st.hasMoreTokens() && count <= n) {
351: words += st.nextToken();
352: if (count < n)
353: words += ' ';
354: count++;
355: }
356:
357: if (st.hasMoreTokens())
358: words += "...";
359: return words;
360: }
361:
362: }
|