001: // htmlTools.java
002: // -----------------------
003: // (C) by Michael Peter Christen; mc@anomic.de,
004: // (C) by Jan Sandbrink (NN), Franz Brausse (FB, karlchenofhell),
005: // (C) by Bjoern 'fuchs' Krombholz (fuchsi)
006: // first published on http://www.yacy.net
007: //
008: // $LastChangedDate: $
009: // $LastChangedRevision: $
010: // $LastChangedBy: $
011: //
012: // This program is free software; you can redistribute it and/or modify
013: // it under the terms of the GNU General Public License as published by
014: // the Free Software Foundation; either version 2 of the License, or
015: // (at your option) any later version.
016: //
017: // This program is distributed in the hope that it will be useful,
018: // but WITHOUT ANY WARRANTY; without even the implied warranty of
019: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
020: // GNU General Public License for more details.
021: //
022: // You should have received a copy of the GNU General Public License
023: // along with this program; if not, write to the Free Software
024: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
025: //
026: // Using this software in any meaning (reading, learning, copying, compiling,
027: // running) means that you agree that the Author(s) is (are) not responsible
028: // for cost, loss of data or any harm that may be caused directly or indirectly
029: // by usage of this softare or this documentation. The usage of this software
030: // is on your own risk. The installation and usage (starting/running) of this
031: // software may allow other people or application to access your computer and
032: // any attached devices and is highly dependent on the configuration of the
033: // software which must be done by the user of the software; the author(s) is
034: // (are) also not responsible for proper configuration and usage of the
035: // software, even if provoked by documentation provided together with
036: // the software.
037: //
038: // Any changes to this file according to the GPL as documented in the file
039: // gpl.txt aside this file in the shipment you received can be done to the
040: // lines that follows this copyright notice here, but changes must not be
041: // done inside the copyright notive above. A re-distribution must contain
042: // the intact and unchanged copyright notice.
043: // Contributions and changes to the program code must be marked as such.
044: package de.anomic.data;
045:
/**
 * Static helpers that convert between plain text and its HTML/XML escaped form.
 *
 * <p>Encoding replaces single characters listed in the internal mapping table by
 * their entity; decoding replaces entities from the same table by their character.
 * Characters and entities that are not in the table are copied unchanged.</p>
 */
public class htmlTools {

    /**
     * Index of the first table entry after the ampersand pair; used as start
     * index when ampersands must not be encoded.
     */
    private static final int AFTER_AMPERSAND = 2;

    /**
     * End index (exclusive) of the XML-safe subset of the table: the first four
     * pairs, i.e. ampersand, quotation mark, less-than and greater-than.
     */
    private static final int XML_SUBSET_END = 8;

    /**
     * Table of {character, entity} pairs: even indexes hold a one-character
     * string, the following odd index holds its replacement.
     *
     * <p>See http://mindprod.com/jgloss/unicode.html for details on the codes.
     * To add new codes or patterns, just put them at the end of the list. Codes
     * or patterns in this list can not be escaped with [= or &lt;pre&gt;.</p>
     *
     * <p>The ampersand pair has to stay first and the four XML pairs have to stay
     * at the front, because callers select sub-ranges of this table by index.
     * Every entity must be non-empty.</p>
     */
    private static final String[] mapping = {
        // Ampersands _have_ to be replaced first. If they were replaced later,
        // other replaced characters containing ampersands would get messed up.
        "\u0026", "&amp;",    // ampersand
        "\"",     "&quot;",   // quotation mark
        "\u003C", "&lt;",     // less than
        "\u003E", "&gt;",     // greater than
        "\\",     "&#092;",   // backslash
        "\u005E", "&#094;",   // caret

        "\u0060", "&#096;",   // accent grave
        "\u007B", "&#123;",   // opening curly brace
        "\u007C", "&#124;",   // vertical bar
        "\u007D", "&#125;",   // closing curly brace
        "\u007E", "&#126;",   // tilde

        "\u0082", "&#130;",
        "\u0083", "&#131;",
        "\u0084", "&#132;",
        "\u0085", "&#133;",
        "\u0086", "&#134;",
        "\u0087", "&#135;",
        "\u0088", "&#136;",
        "\u0089", "&#137;",
        "\u008A", "&#138;",
        "\u008B", "&#139;",
        "\u008C", "&#140;",
        "\u008D", "&#141;",
        "\u008E", "&#142;",

        "\u0091", "&#145;",
        "\u0092", "&#146;",
        "\u0093", "&#147;",
        "\u0094", "&#148;",
        "\u0095", "&#149;",
        "\u0096", "&#150;",
        "\u0097", "&#151;",
        "\u0098", "&#152;",
        "\u0099", "&#153;",
        "\u009A", "&#154;",
        "\u009B", "&#155;",
        "\u009C", "&#156;",
        "\u009D", "&#157;",
        "\u009E", "&#158;",
        "\u009F", "&#159;",

        "\u00A1", "&iexcl;",  // inverted (spanish) exclamation mark
        "\u00A2", "&cent;",   // cent
        "\u00A3", "&pound;",  // pound
        "\u00A4", "&curren;", // currency
        "\u00A5", "&yen;",    // yen
        "\u00A6", "&brvbar;", // broken vertical bar
        "\u00A7", "&sect;",   // section sign
        "\u00A8", "&uml;",    // diaeresis (umlaut)
        "\u00A9", "&copy;",   // copyright sign
        "\u00AA", "&ordf;",   // feminine ordinal indicator
        "\u00AB", "&laquo;",  // left-pointing double angle quotation mark
        "\u00AC", "&not;",    // not sign
        "\u00AD", "&shy;",    // soft hyphen
        "\u00AE", "&reg;",    // registered sign
        "\u00AF", "&macr;",   // macron
        "\u00B0", "&deg;",    // degree sign
        "\u00B1", "&plusmn;", // plus-minus sign
        "\u00B2", "&sup2;",   // superscript two
        "\u00B3", "&sup3;",   // superscript three
        "\u00B4", "&acute;",  // acute accent
        "\u00B5", "&micro;",  // micro sign
        "\u00B6", "&para;",   // paragraph sign
        "\u00B7", "&middot;", // middle dot
        "\u00B8", "&cedil;",  // cedilla
        "\u00B9", "&sup1;",   // superscript one
        "\u00BA", "&ordm;",   // masculine ordinal indicator
        "\u00BB", "&raquo;",  // right-pointing double angle quotation mark
        "\u00BC", "&frac14;", // fraction 1/4
        "\u00BD", "&frac12;", // fraction 1/2
        "\u00BE", "&frac34;", // fraction 3/4
        "\u00BF", "&iquest;", // inverted (spanish) question mark

        "\u00C0", "&Agrave;", "\u00C1", "&Aacute;", "\u00C2", "&Acirc;",
        "\u00C3", "&Atilde;", "\u00C4", "&Auml;",   "\u00C5", "&Aring;",
        "\u00C6", "&AElig;",  "\u00C7", "&Ccedil;", "\u00C8", "&Egrave;",
        "\u00C9", "&Eacute;", "\u00CA", "&Ecirc;",  "\u00CB", "&Euml;",
        "\u00CC", "&Igrave;", "\u00CD", "&Iacute;", "\u00CE", "&Icirc;",
        "\u00CF", "&Iuml;",   "\u00D0", "&ETH;",    "\u00D1", "&Ntilde;",
        "\u00D2", "&Ograve;", "\u00D3", "&Oacute;", "\u00D4", "&Ocirc;",
        "\u00D5", "&Otilde;", "\u00D6", "&Ouml;",   "\u00D7", "&times;",
        "\u00D8", "&Oslash;", "\u00D9", "&Ugrave;", "\u00DA", "&Uacute;",
        "\u00DB", "&Ucirc;",  "\u00DC", "&Uuml;",   "\u00DD", "&Yacute;",
        "\u00DE", "&THORN;",  "\u00DF", "&szlig;",  "\u00E0", "&agrave;",
        "\u00E1", "&aacute;", "\u00E2", "&acirc;",  "\u00E3", "&atilde;",
        "\u00E4", "&auml;",   "\u00E5", "&aring;",  "\u00E6", "&aelig;",
        "\u00E7", "&ccedil;", "\u00E8", "&egrave;", "\u00E9", "&eacute;",
        "\u00EA", "&ecirc;",  "\u00EB", "&euml;",   "\u00EC", "&igrave;",
        "\u00ED", "&iacute;", "\u00EE", "&icirc;",  "\u00EF", "&iuml;",
        "\u00F0", "&eth;",    "\u00F1", "&ntilde;", "\u00F2", "&ograve;",
        "\u00F3", "&oacute;", "\u00F4", "&ocirc;",  "\u00F5", "&otilde;",
        "\u00F6", "&ouml;",   "\u00F7", "&divide;", "\u00F8", "&oslash;",
        "\u00F9", "&ugrave;", "\u00FA", "&uacute;", "\u00FB", "&ucirc;",
        "\u00FC", "&uuml;",   "\u00FD", "&yacute;", "\u00FE", "&thorn;",
        "\u00FF", "&yuml;"
    };

    /**
     * Replaces special characters in a string by their HTML entities.
     *
     * @param text a string that possibly contains special characters, may be
     *        <code>null</code>
     * @param includingAmpersand if <code>false</code> ampersands are not encoded
     * @param forXML if <code>true</code> then only the ampersand, the quotation
     *        mark, the less-than sign and the greater-than sign are encoded
     * @return the encoded string, or <code>null</code> if <code>text</code> is
     *         <code>null</code>
     */
    public static String encodeUnicode2html(String text,
            boolean includingAmpersand, boolean forXML) {
        if (text == null) {
            return null;
        }

        // The ampersand is the first pair of the table, so skipping one pair
        // leaves ampersands untouched.
        final int spos = includingAmpersand ? 0 : AFTER_AMPERSAND;
        // The XML-safe characters are the first four pairs of the table.
        final int epos = forXML ? XML_SUBSET_END : mapping.length;

        return encode(text, mapping, spos, epos);
    }

    /**
     * Like {@link #encodeUnicode2html(String, boolean, boolean)} with
     * <code>forXML = false</code>.
     *
     * @param text a string that possibly contains special characters
     * @param includingAmpersand if <code>false</code> ampersands are not encoded
     * @return the encoded string, or <code>null</code> if <code>text</code> is
     *         <code>null</code>
     */
    public static String encodeUnicode2html(String text,
            boolean includingAmpersand) {
        return encodeUnicode2html(text, includingAmpersand, false);
    }

    /**
     * Replaces the ampersand, the quotation mark and the less-than/greater-than
     * signs by the escaping entities allowed in XML documents.
     *
     * Like {@link #encodeUnicode2html(String, boolean, boolean)} with
     * <code>includingAmpersand = true</code> and <code>forXML = true</code>.
     *
     * @param text the original String
     * @return the encoded String, or <code>null</code> if <code>text</code> is
     *         <code>null</code>
     */
    public static String encodeUnicode2xml(String text) {
        return encodeUnicode2html(text, true, true);
    }

    /**
     * Generic method that replaces occurrences of the characters defined in the
     * map array with their corresponding mapping.
     *
     * <p>Each character of the input is compared with the first character of
     * every key in the selected range; the first matching pair wins. Because
     * the input is processed in a single pass, replacement text is never
     * examined again.</p>
     *
     * @param text The String to process, may be <code>null</code>.
     * @param map An array of {key, replacement} pairs: keys at even offsets from
     *        <code>spos</code>, replacements directly behind them.
     * @param spos It is possible to use a subset of the map only. This parameter
     *        defines the index of the first key to use.
     * @param epos The ending index (exclusive), see above.
     * @return A copy of the original String with all characters defined in the
     *         selected part of the map replaced, or <code>null</code> if
     *         <code>text</code> is <code>null</code>.
     */
    public static String encode(String text, final String[] map,
            int spos, int epos) {
        if (text == null) {
            return null;
        }

        // The length is only an initial capacity; the buffer grows on demand.
        final StringBuffer sb = new StringBuffer(text.length());
        for (int textpos = 0; textpos < text.length(); textpos++) {
            final char c = text.charAt(textpos);
            final int key = indexOfKey(c, map, spos, epos);
            if (key < 0) {
                sb.append(c);
            } else {
                sb.append(map[key + 1]);
            }
        }
        return sb.toString();
    }

    /**
     * Finds the first key in <code>map[spos..epos)</code> (step 2) whose first
     * character equals <code>c</code>.
     *
     * @return the index of the key, or <code>-1</code> if there is none
     */
    private static int indexOfKey(char c, String[] map, int spos, int epos) {
        for (int i = spos; i < epos; i += 2) {
            if (map[i].charAt(0) == c) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Replaces every entity known to the mapping table by the character it
     * stands for. This is the reverse operation of
     * {@link #encodeUnicode2html(String, boolean)}.
     *
     * <p>Matching is exact and case sensitive; entities that are not in the
     * table (including numeric references written differently from the table,
     * e.g. without the leading zero) are copied unchanged. The input is
     * processed in a single pass, so decoded text is never decoded again.</p>
     *
     * @param text the text to decode, may be <code>null</code>
     * @return the decoded text, or <code>null</code> if <code>text</code> is
     *         <code>null</code>
     */
    public static String decodeHtml2Unicode(String text) {
        if (text == null) {
            return null;
        }

        final StringBuffer sb = new StringBuffer(text.length());
        int pos = 0;
        while (pos < text.length()) {
            final int key = indexOfEntityAt(text, pos);
            if (key < 0) {
                // no entity starts here: copy the character as it is
                sb.append(text.charAt(pos));
                pos++;
            } else {
                sb.append(mapping[key]);
                pos += mapping[key + 1].length();
            }
        }
        return sb.toString();
    }

    /**
     * Finds the first table entry whose entity occurs in <code>text</code>
     * exactly at <code>pos</code>.
     *
     * @return the index of the entry's character in {@link #mapping}, or
     *         <code>-1</code> if no entity starts at <code>pos</code>
     */
    private static int indexOfEntityAt(String text, int pos) {
        // TODO: replace matching with hashtable(s)
        for (int i = 0; i < mapping.length; i += 2) {
            final String entity = mapping[i + 1];
            // An empty entity would match everywhere without consuming any
            // input, which would make the caller loop forever.
            if (entity.length() > 0 && text.startsWith(entity, pos)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Small self-test: encodes a sample text, prints the encoded and the decoded
     * form and prints "correct" if the round trip restored the original.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The sample deliberately contains text that already looks like
        // entities, to make sure a round trip does not replace them cyclically.
        String text = "Test-Text mit & um zyklische &uuml; &amp; Ersetzungen auszuschliessen";
        String txet = encodeUnicode2html(text, true);
        System.out.println(txet);
        System.out.println(decodeHtml2Unicode(txet));
        if (decodeHtml2Unicode(txet).equals(text)) {
            System.out.println("correct");
        }
    }
}
|