001: /* Copyright (c) 2006-2007, Vladimir Nikic
002: All rights reserved.
003:
004: Redistribution and use of this software in source and binary forms,
005: with or without modification, are permitted provided that the following
006: conditions are met:
007:
008: * Redistributions of source code must retain the above
009: copyright notice, this list of conditions and the
010: following disclaimer.
011:
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the
014: following disclaimer in the documentation and/or other
015: materials provided with the distribution.
016:
017: * The name of HtmlCleaner may not be used to endorse or promote
018: products derived from this software without specific prior
019: written permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
022: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
023: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
024: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
025: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
026: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
027: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
028: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
029: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
030: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
031: POSSIBILITY OF SUCH DAMAGE.
032:
033: You can contact Vladimir Nikic by sending e-mail to
034: nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
035: subject line.
036: */
037:
038: package org.htmlcleaner;
039:
040: import java.io.*;
041: import java.net.URL;
042:
043: /**
044: * <p>Common utilities.</p>
045: *
046: * Created by: Vladimir Nikic<br/>
047: * Date: November, 2006.
048: */
049: public class Utils {
050:
051: /**
052: * Trims specified string from left.
053: * @param s
054: */
055: public static String ltrim(String s) {
056: if (s == null) {
057: return null;
058: }
059:
060: int index = 0;
061: int len = s.length();
062:
063: while (index < len && Character.isWhitespace(s.charAt(index))) {
064: index++;
065: }
066:
067: return (index >= len) ? "" : s.substring(index);
068: }
069:
070: /**
071: * Trims specified string from right.
072: * @param s
073: */
074: public static String rtrim(String s) {
075: if (s == null) {
076: return null;
077: }
078:
079: int len = s.length();
080: int index = len;
081:
082: while (index > 0 && Character.isWhitespace(s.charAt(index - 1))) {
083: index--;
084: }
085:
086: return (index <= 0) ? "" : s.substring(0, index);
087: }
088:
089: /**
090: * Reads content from the specified URL with specified charset into string
091: * @param url
092: * @param charset
093: * @throws IOException
094: */
095: public static StringBuffer readUrl(URL url, String charset)
096: throws IOException {
097: StringBuffer buffer = new StringBuffer(1024);
098:
099: Object content = url.getContent();
100: if (content instanceof InputStream) {
101: InputStreamReader reader = new InputStreamReader(
102: (InputStream) content, charset);
103: char[] charArray = new char[1024];
104:
105: int charsRead = 0;
106: do {
107: charsRead = reader.read(charArray);
108: if (charsRead >= 0) {
109: buffer.append(charArray, 0, charsRead);
110: }
111: } while (charsRead > 0);
112: }
113:
114: return buffer;
115: }
116:
117: public static boolean isHexadecimalDigit(char ch) {
118: return Character.isDigit(ch) || ch == 'A' || ch == 'a'
119: || ch == 'B' || ch == 'b' || ch == 'C' || ch == 'c'
120: || ch == 'D' || ch == 'd' || ch == 'E' || ch == 'e'
121: || ch == 'F' || ch == 'f';
122: }
123:
124: /**
125: * Escapes XML string.
126: */
127: public static String escapeXml(String s, boolean advanced,
128: boolean recognizeUnicodeChars,
129: boolean translateSpecialEntities) {
130: if (s != null) {
131: int len = s.length();
132: StringBuffer result = new StringBuffer(len);
133:
134: for (int i = 0; i < len; i++) {
135: char ch = s.charAt(i);
136:
137: if (ch == '&') {
138: if ((advanced || recognizeUnicodeChars)
139: && (i < len - 1)
140: && (s.charAt(i + 1) == '#')) {
141: int charIndex = i + 2;
142: String unicode = "";
143: while (charIndex < len
144: && (isHexadecimalDigit(s
145: .charAt(charIndex))
146: || s.charAt(charIndex) == 'x' || s
147: .charAt(charIndex) == 'X')) {
148: unicode += s.charAt(charIndex);
149: charIndex++;
150: }
151: if (charIndex == len || !"".equals(unicode)) {
152: try {
153: char unicodeChar = unicode
154: .toLowerCase().startsWith("x") ? (char) Integer
155: .parseInt(unicode.substring(1),
156: 16)
157: : (char) Integer
158: .parseInt(unicode);
159: if ("&<>\'\"".indexOf(unicodeChar) < 0) {
160: int replaceChunkSize = (charIndex < len && s
161: .charAt(charIndex) == ';') ? unicode
162: .length() + 1
163: : unicode.length();
164: result
165: .append(recognizeUnicodeChars ? String
166: .valueOf(unicodeChar)
167: : "&#" + unicode
168: + ";");
169: i += replaceChunkSize + 1;
170: } else {
171: i = charIndex;
172: result.append("&#" + unicode
173: + ";");
174: }
175: } catch (NumberFormatException e) {
176: i = charIndex;
177: result.append("&#" + unicode + ";");
178: }
179: } else {
180: result.append("&");
181: }
182: } else {
183: if (translateSpecialEntities) {
184: // get following sequence of most 10 characters
185: String seq = s.substring(i, i
186: + Math.min(10, len - i));
187: int semiIndex = seq.indexOf(';');
188: if (semiIndex > 0) {
189: String entity = seq.substring(1,
190: semiIndex);
191: Integer code = (Integer) SpecialEntities.entities
192: .get(entity);
193: if (code != null) {
194: int entityLen = entity.length();
195: if (recognizeUnicodeChars) {
196: result.append((char) code
197: .intValue());
198: } else {
199: result
200: .append("&#" + code
201: + ";");
202: }
203: i += entityLen + 1;
204: continue;
205: }
206: }
207: }
208:
209: if (advanced) {
210: String sub = s.substring(i);
211: if (sub.startsWith("&")) {
212: result.append("&");
213: i += 4;
214: } else if (sub.startsWith("'")) {
215: result.append("'");
216: i += 5;
217: } else if (sub.startsWith(">")) {
218: result.append(">");
219: i += 3;
220: } else if (sub.startsWith("<")) {
221: result.append("<");
222: i += 3;
223: } else if (sub.startsWith(""")) {
224: result.append(""");
225: i += 5;
226: } else {
227: result.append("&");
228: }
229:
230: continue;
231: }
232:
233: result.append("&");
234: }
235: } else if (ch == '\'') {
236: result.append("'");
237: } else if (ch == '>') {
238: result.append(">");
239: } else if (ch == '<') {
240: result.append("<");
241: } else if (ch == '\"') {
242: result.append(""");
243: } else {
244: result.append(ch);
245: }
246: }
247:
248: return result.toString();
249: }
250:
251: return null;
252: }
253:
254: }
|