/*  Copyright (c) 2006-2007, Vladimir Nikic
    All rights reserved.

    Redistribution and use of this software in source and binary forms,
    with or without modification, are permitted provided that the following
    conditions are met:

    * Redistributions of source code must retain the above
      copyright notice, this list of conditions and the
      following disclaimer.

    * Redistributions in binary form must reproduce the above
      copyright notice, this list of conditions and the
      following disclaimer in the documentation and/or other
      materials provided with the distribution.

    * The name of HtmlCleaner may not be used to endorse or promote
      products derived from this software without specific prior
      written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.

    You can contact Vladimir Nikic by sending e-mail to
    nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
    subject line.
*/

package org.htmlcleaner;

import java.io.*;
import java.util.*;

043: /**
044: * Main HTML tokenizer.
045: * <p>Its taks is to parse HTML and produce list of valid tokens:
046: * open tag tokens, end tag tokens, contents (text) and comments.
047: * As soon as new item is added to token list, cleaner is invoked
048: * to clean current list at the end.</p>
049: *
050: * Created by: Vladimir Nikic.<br>
051: * Date: November, 2006
052:
053: */
054: public class HtmlTokenizer {
055:
056: private final static int WORKING_BUFFER_SIZE = 1024;
057:
058: private final static String trChString = "\u0131\u0130\u015F\u015E\u011F\u011E\u00fd\u00dd\u00fe\u00de\u00f0\u00d0\u00E7\u00C7\u00FC\u00DC\u00F6\u00D6";
059: private final static String unicode[] = { "ı", "İ",
060: "ş", "Ş", "ğ", "Ğ", "ı", "İ",
061: "ş", "Ş", "ğ", "Ğ", "ç", "Ç",
062: "ü", "Ü", "ö", "Ö" };
063:
064: private final static String dirtyChString = "\n\"&<>";
065: private final static String cleanhtmlEntities[] = { "<br/>",
066: """, "&", "<", ">" };
067:
068: private final static String clickAttributes = "onblur$onchange$onclick$ondblclick$onfocus$onkeydown$onkeypress$onkeyup$onLoad$onmousedown$onmouseout$onmouseover$onmousemove$onmouseup$onselect$onunload";
069:
070: private BufferedReader _reader;
071: private char[] _working = new char[WORKING_BUFFER_SIZE];
072:
073: private transient int _pos = 0;
074: private transient int _len = -1;
075:
076: private transient StringBuffer _saved = new StringBuffer(512);
077:
078: private transient boolean _isLateForDoctype = false;
079: private transient TagToken _currentTagToken = null;
080: private transient List _tokenList = new ArrayList();
081:
082: private boolean _asExpected = true;
083:
084: private boolean _isScriptContext = false;
085: private boolean _isStyleContext = false;
086:
087: private boolean _isTextPlain = false;
088:
089: private HtmlCleaner cleaner;
090:
091: /**
092: * Constructor - cretes instance of the parser with specified content.
093: * @param cleaner
094: * @throws IOException
095: */
096: public HtmlTokenizer(HtmlCleaner cleaner) throws IOException {
097: this ._reader = new BufferedReader(cleaner.getReader());
098: this .cleaner = cleaner;
099: }
100:
101: private void addToken(BaseToken token) {
102: _tokenList.add(token);
103: cleaner.makeTree(_tokenList, _tokenList.listIterator(_tokenList
104: .size() - 1));
105: }
106:
107: private void readIfNeeded(int neededChars) throws IOException {
108: if (_len == -1 && _pos + neededChars >= WORKING_BUFFER_SIZE) {
109: int numToCopy = WORKING_BUFFER_SIZE - _pos;
110: System.arraycopy(_working, _pos, _working, 0, numToCopy);
111: _pos = 0;
112: int size = _reader.read(_working, numToCopy,
113: WORKING_BUFFER_SIZE - numToCopy);
114:
115: if (size < WORKING_BUFFER_SIZE - numToCopy) {
116: _len = (size == -1 && numToCopy == 0) ? 0 : size
117: + numToCopy;
118: } else if (size == -1) {
119: _len = numToCopy;
120: }
121: }
122: }
123:
124: List getTokenList() {
125: return this ._tokenList;
126: }
127:
128: private void go() throws IOException {
129: _pos++;
130: readIfNeeded(0);
131: }
132:
133: private void go(int step) throws IOException {
134: _pos += step;
135: readIfNeeded(step - 1);
136: }
137:
138: /**
139: * Checks if content starts with specified value at the current position.
140: * @param value
141: * @return true if starts with specified value, false otherwise.
142: * @throws IOException
143: */
144: private boolean startsWith(String value) throws IOException {
145: int valueLen = value.length();
146: readIfNeeded(valueLen);
147: if (_len >= 0 && _pos + valueLen > _len) {
148: return false;
149: }
150:
151: for (int i = 0; i < valueLen; i++) {
152: char ch1 = Character.toLowerCase(value.charAt(i));
153: char ch2 = Character.toLowerCase(_working[_pos + i]);
154: if (ch1 != ch2) {
155: return false;
156: }
157: }
158:
159: return true;
160: }
161:
162: /**
163: * Checks if character at specified position is whitespace.
164: * @param position
165: * @return true is whitespace, false otherwise.
166: */
167: private boolean isWhitespace(int position) {
168: if (_len >= 0 && position >= _len) {
169: return false;
170: }
171:
172: return Character.isWhitespace(_working[position]);
173: }
174:
175: /**
176: * Checks if character at current runtime position is whitespace.
177: * @return true is whitespace, false otherwise.
178: */
179: private boolean isWhitespace() {
180: return isWhitespace(_pos);
181: }
182:
183: /**
184: * Checks if character at specified position is equal to specified char.
185: * @param position
186: * @param ch
187: * @return true is equals, false otherwise.
188: */
189: private boolean isChar(int position, char ch) {
190: if (_len >= 0 && position >= _len) {
191: return false;
192: }
193:
194: return Character.toLowerCase(ch) == Character
195: .toLowerCase(_working[position]);
196: }
197:
198: /**
199: * Checks if character at current runtime position is equal to specified char.
200: * @param ch
201: * @return true is equal, false otherwise.
202: */
203: private boolean isChar(char ch) {
204: return isChar(_pos, ch);
205: }
206:
207: /**
208: * Checks if character at specified position can be identifier start.
209: * @param position
210: * @return true is may be identifier start, false otherwise.
211: */
212: private boolean isIdentifierStartChar(int position) {
213: if (_len >= 0 && position >= _len) {
214: return false;
215: }
216:
217: char ch = _working[position];
218: return Character.isUnicodeIdentifierStart(ch) || (':' == ch);
219: }
220:
221: /**
222: * Checks if character at current runtime position can be identifier start.
223: * @return true is may be identifier start, false otherwise.
224: */
225: private boolean isIdentifierStartChar() {
226: return isIdentifierStartChar(_pos);
227: }
228:
229: /**
230: * Checks if character at current runtime position can be identifier part.
231: * @return true is may be identifier part, false otherwise.
232: */
233: private boolean isIdentifierChar() {
234: if (_len >= 0 && _pos >= _len) {
235: return false;
236: }
237:
238: char ch = _working[_pos];
239: return Character.isUnicodeIdentifierStart(ch)
240: || Character.isDigit(ch) || (':' == ch) || ('.' == ch)
241: || ('-' == ch);
242: }
243:
244: /**
245: * Checks if end of the content is reached.
246: */
247: private boolean isAllRead() {
248: return _len >= 0 && _pos >= _len;
249: }
250:
251: /**
252: * Saves specified character to the temporary buffer.
253: * @param ch
254: */
255: private void save(char ch) {
256: int chPos;
257: if (_isTextPlain) {
258: chPos = dirtyChString.indexOf(ch);
259: if (chPos > -1) {
260: _saved.append(cleanhtmlEntities[chPos]);
261: return;
262: }
263: }
264: chPos = trChString.indexOf(ch);
265: if (chPos > -1)
266: _saved.append(unicode[chPos]);
267: else
268: _saved.append(ch);
269: }
270:
271: /**
272: * Saves character at current runtime position to the temporary buffer.
273: */
274: private void saveCurrent() {
275: if (!isAllRead()) {
276: save(_working[_pos]);
277: }
278: }
279:
280: /**
281: * Saves specified number of characters at current runtime position to the temporary buffer.
282: * @throws IOException
283: */
284: private void saveCurrent(int size) throws IOException {
285: readIfNeeded(size);
286: int pos = _pos;
287: while (!isAllRead() && (size > 0)) {
288: save(_working[pos]);
289: pos++;
290: size--;
291: }
292: }
293:
294: /**
295: * Skips whitespaces at current position and moves foreward until
296: * non-whitespace character is found or the end of content is reached.
297: * @throws IOException
298: */
299: private void skipWhitespaces() throws IOException {
300: while (!isAllRead() && isWhitespace()) {
301: saveCurrent();
302: go();
303: }
304: }
305:
306: private void addSavedAsContent() {
307: if (_saved.length() > 0) {
308: if (!_isScriptContext)
309: addToken(new ContentToken(_saved.toString()));
310: _saved.delete(0, _saved.length());
311: }
312: }
313:
314: /**
315: * Starts parsing HTML.
316: * @throws IOException
317: */
318: void start(boolean isTextPlain) throws IOException {
319: // initialize runtime values
320: _isTextPlain = isTextPlain;
321: _currentTagToken = null;
322: _tokenList.clear();
323: _asExpected = true;
324: _isScriptContext = false;
325: _isStyleContext = false;
326: _isLateForDoctype = false;
327:
328: this ._pos = WORKING_BUFFER_SIZE;
329: readIfNeeded(0);
330:
331: while (!isAllRead()) {
332: // resets all the runtime values
333: _saved.delete(0, _saved.length());
334: _currentTagToken = null;
335: _asExpected = true;
336:
337: // this is enough for making decision
338: readIfNeeded(10);
339:
340: if (isTextPlain) {
341: if (startsWith("http://") || startsWith("https://")
342: || startsWith("www.")) {
343: tagAnchorStart();
344: } else {
345: content();
346: }
347: } else if (_isScriptContext) {
348: if (startsWith("</script")
349: && (isWhitespace(_pos + 8) || isChar(_pos + 8,
350: '>'))) {
351: tagEnd();
352: } else {
353: content();
354: }
355: } else if (_isStyleContext) {
356: if (startsWith("</style")
357: && (isWhitespace(_pos + 7) || isChar(_pos + 7,
358: '>'))) {
359: tagEnd();
360: } else {
361: content();
362: }
363: } else {
364: if (startsWith("<!doctype")) {
365: if (!_isLateForDoctype) {
366: doctype();
367: _isLateForDoctype = true;
368: } else {
369: ignore();
370: }
371: } else if (startsWith("</")
372: && isIdentifierStartChar(_pos + 2)) {
373: _isLateForDoctype = true;
374: tagEnd();
375: } else if (startsWith("<!--")) {
376: comment();
377: } else if (startsWith("<")
378: && isIdentifierStartChar(_pos + 1)) {
379: _isLateForDoctype = true;
380: tagStart();
381: } else {
382: content();
383: }
384: }
385: }
386:
387: _reader.close();
388: }
389:
390: /**
391: * Parses start of the tag.
392: * It expects that current position is at the "<" after which
393: * the tag's name follows.
394: * @throws IOException
395: */
396: private void tagStart() throws IOException {
397: saveCurrent();
398: go();
399:
400: if (isAllRead()) {
401: return;
402: }
403:
404: String tagName = identifier();
405: _currentTagToken = new TagNode(tagName);
406:
407: if (_asExpected) {
408: skipWhitespaces();
409: tagAttributes();
410:
411: String originalSource = _saved.toString();
412: if (isChar('>')) {
413: go();
414: if ("script".equalsIgnoreCase(tagName)) {
415: _isScriptContext = true;
416: } else if ("style".equalsIgnoreCase(tagName)) {
417: _isStyleContext = true;
418: } else if ("a".equalsIgnoreCase(tagName)
419: || "area".equalsIgnoreCase(tagName)) {
420: int pos = originalSource.toLowerCase(
421: new Locale("en", "US")).indexOf("mailto:");
422: if (pos > 0) {
423: if (originalSource.length() > pos + 7) {
424: char ch = originalSource.charAt(pos - 1);
425: if (ch == '\'' || ch == '\"') {
426: int posEnd = originalSource.indexOf(ch,
427: pos + 7);
428: if (posEnd > pos + 7)
429: _currentTagToken
430: .addAttribute(
431: "href",
432: "javascript:parent.compose.fastEmail('"
433: + originalSource
434: .substring(
435: pos + 7,
436: posEnd)
437: + "');");
438: }
439: }
440: } else
441: _currentTagToken.addAttribute("target",
442: "_blank");
443: }
444: originalSource += ">";
445: } else if (startsWith("/>")) {
446: go(2);
447: addToken(new EndTagToken(tagName));
448: originalSource += "/>";
449: }
450:
451: if (!_isScriptContext) {
452: addToken(_currentTagToken);
453: _currentTagToken.setOriginalSource(originalSource);
454: }
455:
456: _currentTagToken = null;
457: } else {
458: addSavedAsContent();
459: }
460: }
461:
462: /**
463: * Parses start of the tag.
464: * It expects that current position is at the "http://" or "www." after which
465: * the tag's name follows.
466: * @throws IOException
467: */
468: private void tagAnchorStart() throws IOException {
469: while (!isAllRead() && !isWhitespace()) {
470: saveCurrent();
471: go();
472: }
473:
474: String href = _saved.toString();
475: if (href.startsWith("www."))
476: href = "http://" + href;
477:
478: TagNode anchorNode = new TagNode("A");
479: anchorNode.addAttribute("href", href);
480: anchorNode.addAttribute("target", "_blank");
481:
482: addToken(anchorNode);
483: addSavedAsContent();
484: addToken(new EndTagToken("A"));
485:
486: anchorNode = null;
487:
488: skipWhitespaces();
489: addSavedAsContent();
490: }
491:
492: /**
493: * Parses start of the tag.
494: * It expects that current position is at the "@" after which
495: * the tag's name follows.
496: * @throws IOException
497: */
498: private void tagMailStart() throws IOException {
499: while (!isAllRead() && !isWhitespace() && !isChar('>')) {
500: saveCurrent();
501: go();
502: }
503:
504: TagNode anchorNode = new TagNode("A");
505: anchorNode.addAttribute("href", "javascript:parent.fastEmail('"
506: + _saved.toString() + "');");
507:
508: addToken(anchorNode);
509: addSavedAsContent();
510: addToken(new EndTagToken("A"));
511:
512: anchorNode = null;
513:
514: skipWhitespaces();
515: addSavedAsContent();
516: }
517:
518: /**
519: * Parses end of the tag.
520: * It expects that current position is at the "<" after which
521: * "/" and the tag's name follows.
522: * @throws IOException
523: */
524: private void tagEnd() throws IOException {
525: saveCurrent(2);
526: go(2);
527:
528: if (isAllRead()) {
529: return;
530: }
531:
532: String tagName = identifier();
533: _currentTagToken = new EndTagToken(tagName);
534:
535: if (_asExpected) {
536: skipWhitespaces();
537: tagAttributes();
538:
539: String originalSource = _saved.toString();
540: addToken(_currentTagToken);
541:
542: if (isChar('>')) {
543: go();
544: originalSource += ">";
545: }
546:
547: if ("script".equalsIgnoreCase(tagName)) {
548: _isScriptContext = false;
549: } else if ("style".equalsIgnoreCase(tagName)) {
550: _isStyleContext = false;
551: }
552:
553: _currentTagToken.setOriginalSource(originalSource);
554: _currentTagToken = null;
555: } else {
556: addSavedAsContent();
557: }
558: }
559:
560: /**
561: * Parses an identifier from the current position.
562: * @throws IOException
563: */
564: private String identifier() throws IOException {
565: _asExpected = true;
566:
567: if (!isIdentifierStartChar()) {
568: _asExpected = false;
569: return null;
570: }
571:
572: StringBuffer tagName = new StringBuffer(16);
573:
574: while (!isAllRead() && isIdentifierChar()) {
575: saveCurrent();
576: tagName.append(_working[_pos]);
577: go();
578: }
579:
580: return tagName.toString();
581: }
582:
583: /**
584: * Parses list tag attributes from the current position.
585: * @throws IOException
586: */
587: private void tagAttributes() throws IOException {
588: while (!isAllRead() && _asExpected && !isChar('>')
589: && !startsWith("/>")) {
590: skipWhitespaces();
591: String attName = identifier();
592:
593: if (!_asExpected) {
594: if (!isChar('<') && !isChar('>') && !startsWith("/>")) {
595: saveCurrent();
596: go();
597: }
598:
599: if (!isChar('<')) {
600: _asExpected = true;
601: }
602:
603: continue;
604: }
605:
606: String attValue = attName;
607:
608: skipWhitespaces();
609: if (isChar('=')) {
610: saveCurrent();
611: go();
612: attValue = attributeValue();
613: }
614:
615: if (-1 != clickAttributes.indexOf(attName
616: .toLowerCase(new Locale("en", "US"))))
617: attValue = "return false;";
618: if (_asExpected) {
619: _currentTagToken.addAttribute(attName, attValue);
620: }
621: }
622: }
623:
624: /**
625: * Parses a single tag attribute - it is expected to be in one of the forms:
626: * name=value
627: * name="value"
628: * name='value'
629: * name
630: * @throws IOException
631: */
632: private String attributeValue() throws IOException {
633: skipWhitespaces();
634:
635: if (isChar('<') || isChar('>') || startsWith("/>")) {
636: return "";
637: }
638:
639: boolean isQuoteMode = false;
640: boolean isAposMode = false;
641:
642: StringBuffer result = new StringBuffer();
643:
644: if (isChar('\'')) {
645: isAposMode = true;
646: saveCurrent();
647: go();
648: } else if (isChar('\"')) {
649: isQuoteMode = true;
650: saveCurrent();
651: go();
652: }
653:
654: while (!isAllRead()
655: && ((isAposMode && !isChar('\''))
656: || (isQuoteMode && !isChar('\"')) || (!isAposMode
657: && !isQuoteMode
658: && !isWhitespace()
659: && !isChar('>') && !startsWith("/>")))) {
660: result.append(_working[_pos]);
661: saveCurrent();
662: go();
663: }
664:
665: if (isChar('\'') && isAposMode) {
666: saveCurrent();
667: go();
668: } else if (isChar('\"') && isQuoteMode) {
669: saveCurrent();
670: go();
671: }
672:
673: return result.toString();
674: }
675:
676: private void content() throws IOException {
677: while (!isAllRead()) {
678: saveCurrent();
679: go();
680:
681: if (_isTextPlain) {
682: if (isWhitespace()) {
683: skipWhitespaces();
684: break;
685: } else if (isChar('<')) {
686: saveCurrent();
687: go();
688: break;
689: } else if (_working[_pos] == '@') {
690: tagMailStart();
691: return;
692: }
693: } else {
694: if (isChar('<'))
695: break;
696: }
697: }
698:
699: addSavedAsContent();
700: }
701:
702: private void ignore() throws IOException {
703: while (!isAllRead()) {
704: go();
705: if (isChar('<')) {
706: break;
707: }
708: }
709: }
710:
711: private void comment() throws IOException {
712: go(4);
713: while (!isAllRead() && !startsWith("-->")) {
714: saveCurrent();
715: go();
716: }
717:
718: if (startsWith("-->")) {
719: go(3);
720: }
721:
722: if (_saved.length() > 0) {
723: if (!cleaner.isOmitComments()) {
724: String hyphenRepl = cleaner
725: .getHyphenReplacementInComment();
726: String comment = _saved.toString().replaceAll("--",
727: hyphenRepl + hyphenRepl);
728:
729: if (comment.length() > 0 && comment.charAt(0) == '-') {
730: comment = hyphenRepl + comment.substring(1);
731: }
732: int len = comment.length();
733: if (len > 0 && comment.charAt(len - 1) == '-') {
734: comment = comment.substring(0, len - 1)
735: + hyphenRepl;
736: }
737:
738: addToken(new CommentToken(comment));
739: }
740: _saved.delete(0, _saved.length());
741: }
742: }
743:
744: private void doctype() throws IOException {
745: go(9);
746:
747: skipWhitespaces();
748: String part1 = identifier();
749: skipWhitespaces();
750: String part2 = identifier();
751: skipWhitespaces();
752: String part3 = attributeValue();
753: skipWhitespaces();
754: String part4 = attributeValue();
755:
756: ignore();
757:
758: DoctypeToken _docType = new DoctypeToken(part1, part2, part3,
759: part4);
760:
761: if (_docType.isValid()) {
762: cleaner.setDoctype(_docType);
763: }
764: }
765:
766: }
|