001: /* Copyright (c) 2006-2007, Vladimir Nikic
002: All rights reserved.
003:
004: Redistribution and use of this software in source and binary forms,
005: with or without modification, are permitted provided that the following
006: conditions are met:
007:
008: * Redistributions of source code must retain the above
009: copyright notice, this list of conditions and the
010: following disclaimer.
011:
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the
014: following disclaimer in the documentation and/or other
015: materials provided with the distribution.
016:
017: * The name of HtmlCleaner may not be used to endorse or promote
018: products derived from this software without specific prior
019: written permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
022: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
023: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
024: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
025: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
026: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
027: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
028: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
029: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
030: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
031: POSSIBILITY OF SUCH DAMAGE.
032:
033: You can contact Vladimir Nikic by sending e-mail to
034: nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
035: subject line.
036: */
037:
038: package org.htmlcleaner;
039:
040: import java.io.*;
041: import java.net.URL;
042:
043: /**
044: * <p>Common utilities.</p>
045: *
046: * Created by: Vladimir Nikic<br/>
047: * Date: November, 2006.
048: */
049: public class Utils {
050:
051: /**
052: * Trims specified string from left.
053: * @param s
054: */
055: public static String ltrim(String s) {
056: if (s == null) {
057: return null;
058: }
059:
060: int index = 0;
061: int len = s.length();
062:
063: while (index < len && Character.isWhitespace(s.charAt(index))) {
064: index++;
065: }
066:
067: return (index >= len) ? "" : s.substring(index);
068: }
069:
070: /**
071: * Trims specified string from right.
072: * @param s
073: */
074: public static String rtrim(String s) {
075: if (s == null) {
076: return null;
077: }
078:
079: int len = s.length();
080: int index = len;
081:
082: while (index > 0 && Character.isWhitespace(s.charAt(index - 1))) {
083: index--;
084: }
085:
086: return (index <= 0) ? "" : s.substring(0, index);
087: }
088:
089: /**
090: * Reads content from the specified URL with specified charset into string
091: * @param url
092: * @param charset
093: * @throws IOException
094: */
095: public static StringBuffer readUrl(URL url, String charset)
096: throws IOException {
097: StringBuffer buffer = new StringBuffer(1024);
098:
099: Object content = url.getContent();
100: if (content instanceof InputStream) {
101: InputStreamReader reader = new InputStreamReader(
102: (InputStream) content, charset);
103: char[] charArray = new char[1024];
104:
105: int charsRead = 0;
106: do {
107: charsRead = reader.read(charArray);
108: if (charsRead >= 0) {
109: buffer.append(charArray, 0, charsRead);
110: }
111: } while (charsRead > 0);
112: }
113:
114: return buffer;
115: }
116:
117: public static boolean isHexadecimalDigit(char ch) {
118: return Character.isDigit(ch) || ch == 'A' || ch == 'a'
119: || ch == 'B' || ch == 'b' || ch == 'C' || ch == 'c'
120: || ch == 'D' || ch == 'd' || ch == 'E' || ch == 'e'
121: || ch == 'F' || ch == 'f';
122: }
123:
124: /**
125: * Escapes XML string.
126: */
127: public static String escapeXml(String s, boolean advanced,
128: boolean recognizeUnicodeChars,
129: boolean translateSpecialEntities) {
130: if (s != null) {
131: int len = s.length();
132: StringBuffer result = new StringBuffer(len);
133:
134: for (int i = 0; i < len; i++) {
135: char ch = s.charAt(i);
136:
137: if (ch == '&') {
138: if (recognizeUnicodeChars && (i < len - 1)
139: && (s.charAt(i + 1) == '#')) {
140: int charIndex = i + 2;
141: String unicode = "";
142: while (charIndex < len
143: && (isHexadecimalDigit(s
144: .charAt(charIndex))
145: || s.charAt(charIndex) == 'x' || s
146: .charAt(charIndex) == 'X')) {
147: unicode += s.charAt(charIndex);
148: charIndex++;
149: }
150: if (charIndex == len || !"".equals(unicode)) {
151: try {
152: char unicodeChar = unicode
153: .toLowerCase().startsWith("x") ? (char) Integer
154: .parseInt(unicode.substring(1),
155: 16)
156: : (char) Integer
157: .parseInt(unicode);
158: if ("&<>\'\"".indexOf(unicodeChar) < 0) {
159: int replaceChunkSize = (charIndex < len && s
160: .charAt(charIndex) == ';') ? unicode
161: .length() + 1
162: : unicode.length();
163: result.append(String
164: .valueOf(unicodeChar));
165: i += replaceChunkSize + 1;
166: } else {
167: i = charIndex;
168: result.append("&#" + unicode
169: + ";");
170: }
171: } catch (NumberFormatException e) {
172: i = charIndex;
173: result.append("&#" + unicode + ";");
174: }
175: } else {
176: result.append("&");
177: }
178: } else {
179: if (translateSpecialEntities) {
180: // get following sequence of most 10 characters
181: String seq = s.substring(i, i
182: + Math.min(10, len - i));
183: int semiIndex = seq.indexOf(';');
184: if (semiIndex > 0) {
185: String entity = seq.substring(1,
186: semiIndex);
187: Integer code = (Integer) SpecialEntities.entities
188: .get(entity);
189: if (code != null) {
190: int entityLen = entity.length();
191: result.append((char) code
192: .intValue());
193: i += entityLen + 1;
194: continue;
195: }
196: }
197: }
198:
199: if (advanced) {
200: String sub = s.substring(i);
201: if (sub.startsWith("&")) {
202: result.append("&");
203: i += 4;
204: /*
205: } else if ( sub.startsWith("'") ) {
206: result.append("'");
207: i += 5;
208: */
209: } else if (sub.startsWith(">")) {
210: result.append(">");
211: i += 3;
212: } else if (sub.startsWith("<")) {
213: result.append("<");
214: i += 3;
215: } else if (sub.startsWith(""")) {
216: result.append(""");
217: i += 5;
218: } else {
219: result.append("&");
220: }
221:
222: continue;
223: }
224:
225: result.append("&");
226: }
227: /*
228: } else if (ch == '\'') {
229: result.append("'");
230: */
231: } else if (ch == '>') {
232: result.append(">");
233: } else if (ch == '<') {
234: result.append("<");
235: } else if (ch == '\"') {
236: result.append(""");
237: } else {
238: result.append(ch);
239: }
240: }
241:
242: return result.toString();
243: }
244:
245: return null;
246: }
247:
248: }
|