001: /* Copyright (c) 2006-2007, Vladimir Nikic
002: All rights reserved.
003:
004: Redistribution and use of this software in source and binary forms,
005: with or without modification, are permitted provided that the following
006: conditions are met:
007:
008: * Redistributions of source code must retain the above
009: copyright notice, this list of conditions and the
010: following disclaimer.
011:
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the
014: following disclaimer in the documentation and/or other
015: materials provided with the distribution.
016:
017: * The name of HtmlCleaner may not be used to endorse or promote
018: products derived from this software without specific prior
019: written permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
022: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
023: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
024: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
025: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
026: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
027: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
028: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
029: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
030: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
031: POSSIBILITY OF SUCH DAMAGE.
032:
033: You can contact Vladimir Nikic by sending e-mail to
034: nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
035: subject line.
036: */
037:
038: package org.htmlcleaner;
039:
040: import org.w3c.dom.Document;
041:
042: import javax.xml.parsers.ParserConfigurationException;
043: import java.io.*;
044: import java.net.URL;
045: import java.util.*;
046:
047: /**
048: * Main HtmlCleaner class.
049: *
* <p>It represents the public interface to the user. Its task is to call the tokenizer with
* the specified source HTML, traverse the list of produced tokens and create an internal
* object model. It also offers a set of methods to write the resulting XML to a string,
* a file or any output stream.</p>
054: * <p>Typical usage is the following:</p>
055: *
056: * <xmp>
057: * HtmlCleaner cleaner = new HtmlCleaner(...); // one of few constructors
058: * cleaner.setXXX(...) // optionally, set cleaner's behaviour
* cleaner.clean(...); // runs the cleaning process
* cleaner.writeXmlXXX(...); // writes resulting XML to string, file or any output stream
* // cleaner.createDOM(); // alternatively, builds an org.w3c.dom.Document from the result
062: * </xmp>
063: *
064: * Created by: Vladimir Nikic <br/>
065: * Date: November, 2006
066: */
067: public class HtmlCleaner {
068:
/**
 * Charset name used by the File/URL constructors that take no explicit
 * charset; read once from the "file.encoding" system property.
 */
public static final String DEFAULT_CHARSET = System
        .getProperty("file.encoding");

// Selectors understood by writeXml(Writer, int): they pick the simple,
// compact or pretty XML serializer respectively.
private static final int WRITE_METHOD_SIMPLE = 0;
private static final int WRITE_METHOD_COMPACT = 1;
private static final int WRITE_METHOD_PRETTY = 2;
075:
076: /**
077: * Contains information about single open tag
078: */
079: private class TagPos {
080: private int position;
081: private String name;
082: private TagInfo info;
083:
084: TagPos(int position, String name) {
085: this .position = position;
086: this .name = name;
087: this .info = tagInfoProvider.getTagInfo(name);
088: }
089: }
090:
091: /**
092: * Class that contains information and mathods for managing list of open,
093: * but unhandled tags.
094: */
095: private class OpenTags {
096: private List list = new ArrayList();
097: private TagPos last = null;
098: private Set set = new HashSet();
099:
100: private boolean isEmpty() {
101: return list.isEmpty();
102: }
103:
104: private void addTag(String tagName, int position) {
105: last = new TagPos(position, tagName);
106: list.add(last);
107: set.add(tagName);
108: }
109:
110: private void removeTag(String tagName) {
111: ListIterator it = list.listIterator(list.size());
112: while (it.hasPrevious()) {
113: TagPos currTagPos = (TagPos) it.previous();
114: if (tagName.equals(currTagPos.name)) {
115: it.remove();
116: break;
117: }
118: }
119:
120: last = list.isEmpty() ? null : (TagPos) list.get(list
121: .size() - 1);
122: }
123:
124: private TagPos findFirstTagPos() {
125: return list.isEmpty() ? null : (TagPos) list.get(0);
126: }
127:
128: private TagPos getLastTagPos() {
129: return last;
130: }
131:
132: private TagPos findTag(String tagName) {
133: if (tagName != null) {
134: ListIterator it = list.listIterator(list.size());
135: while (it.hasPrevious()) {
136: TagPos currTagPos = (TagPos) it.previous();
137: if (tagName.equals(currTagPos.name)) {
138: return currTagPos;
139: }
140: }
141: }
142:
143: return null;
144: }
145:
146: private boolean tagExists(String tagName) {
147: TagPos tagPos = findTag(tagName);
148: return tagPos != null;
149: }
150:
151: private TagPos findTagToPlaceRubbish() {
152: TagPos result = null, prev = null;
153:
154: if (!isEmpty()) {
155: ListIterator it = list.listIterator(list.size());
156: while (it.hasPrevious()) {
157: result = (TagPos) it.previous();
158: if (result.info == null
159: || result.info.allowsAnything()) {
160: if (prev != null) {
161: return prev;
162: }
163: }
164: prev = result;
165: }
166: }
167:
168: return result;
169: }
170:
171: private boolean tagEncountered(String tagName) {
172: return set.contains(tagName);
173: }
174:
175: /**
176: * Checks if any of tags specified in the set are already open.
177: * @param tags
178: */
179: private boolean someAlreadyOpen(Set tags) {
180: Iterator it = list.iterator();
181: while (it.hasNext()) {
182: TagPos curr = (TagPos) it.next();
183: if (tags.contains(curr.name)) {
184: return true;
185: }
186: }
187:
188: return false;
189: }
190: }
191:
// Source of per-tag metadata (TagInfo) used while building the tree.
private ITagInfoProvider tagInfoProvider;

// Character source of the HTML to clean; handed out via getReader().
private Reader reader;
// Open-tag bookkeeping; temporarily swapped for a fresh instance in closeSnippet().
private transient OpenTags _openTags = new OpenTags();
// Doctype token stored via setDoctype(); null until one is set.
private transient DoctypeToken _docType = null;
// Sorted set of the names of all start tags seen by makeTree().
private Set allTags = new TreeSet();

// Behaviour flags with their defaults. Only omitUnknownTags and
// omitDeprecatedTags are read in this class (by makeTree); the others are
// merely stored here and exposed through getters.
// NOTE(review): presumably the remaining flags are read by the tokenizer/serializers - confirm.
private boolean advancedXmlEscape = true;
private boolean useCdataForScriptAndStyle = true;
private boolean translateSpecialEntities = true;
private boolean recognizeUnicodeChars = true;
private boolean omitUnknownTags = false;
private boolean omitDeprecatedTags = false;
private boolean omitComments = false;
private boolean omitXmlDeclaration = false;
private boolean omitDoctypeDeclaration = true;
private boolean omitXmlnsAttributes = false;
private String hyphenReplacementInComment = "=";

// Skeleton nodes of the resulting document, re-created on every clean().
private TagNode htmlNode;
private TagNode bodyNode;
private TagNode headNode;
// Stylesheet <link> node; only created by clean() when both of its flags are true.
private TagNode styleNode;
215:
/**
 * Creates a cleaner that reads the HTML from a string.
 * @param htmlContent HTML source text
 * @param tagInfoProvider tag metadata source; when null,
 *        {@code HtmlTagProvider.getInstance()} is used
 */
public HtmlCleaner(String htmlContent,
        ITagInfoProvider tagInfoProvider) {
    this.reader = new StringReader(htmlContent);
    if (tagInfoProvider == null) {
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    } else {
        this.tagInfoProvider = tagInfoProvider;
    }
}
228:
/**
 * Creates a cleaner that reads the HTML from a string and uses the
 * tag metadata of {@code HtmlTagProvider.getInstance()}.
 * @param htmlContent HTML source text
 */
public HtmlCleaner(String htmlContent) {
    this(htmlContent, HtmlTagProvider.getInstance());
}
237:
238: /**
239: * Constructor - creates the instance for specified file.
240: * @param file
241: * @param charset
242: * @throws IOException
243: */
244: public HtmlCleaner(File file, String charset,
245: ITagInfoProvider tagInfoProvider) throws IOException {
246: FileInputStream in = new FileInputStream(file);
247: this .reader = new InputStreamReader(in, charset);
248: this .tagInfoProvider = tagInfoProvider == null ? HtmlTagProvider
249: .getInstance()
250: : tagInfoProvider;
251: }
252:
/**
 * Creates a cleaner that reads the HTML from a file, using the tag
 * metadata of {@code HtmlTagProvider.getInstance()}.
 * @param file file containing the HTML
 * @param charset name of the charset used to decode the file
 * @throws IOException propagated from the three-argument file constructor
 */
public HtmlCleaner(File file, String charset) throws IOException {
    this(file, charset, HtmlTagProvider.getInstance());
}
262:
/**
 * Creates a cleaner that reads the HTML from a file decoded with
 * {@link #DEFAULT_CHARSET}.
 * @param file file containing the HTML
 * @param tagInfoProvider tag metadata source, passed through unchanged
 * @throws IOException propagated from the three-argument file constructor
 */
public HtmlCleaner(File file, ITagInfoProvider tagInfoProvider)
        throws IOException {
    this(file, DEFAULT_CHARSET, tagInfoProvider);
}
272:
/**
 * Creates a cleaner that reads the HTML from a file decoded with
 * {@link #DEFAULT_CHARSET}, using the tag metadata of
 * {@code HtmlTagProvider.getInstance()}.
 * @param file file containing the HTML
 * @throws IOException propagated from the three-argument file constructor
 */
public HtmlCleaner(File file) throws IOException {
    this(file, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
}
281:
/**
 * Creates a cleaner for the content of a URL. The content is fetched
 * right away through {@code Utils.readUrl} and kept in memory.
 * @param url location of the HTML
 * @param charset charset name handed to {@code Utils.readUrl}
 * @param tagInfoProvider tag metadata source; when null,
 *        {@code HtmlTagProvider.getInstance()} is used
 * @throws IOException propagated from {@code Utils.readUrl}
 */
public HtmlCleaner(URL url, String charset,
        ITagInfoProvider tagInfoProvider) throws IOException {
    this.reader = new StringReader(Utils.readUrl(url, charset).toString());
    if (tagInfoProvider == null) {
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    } else {
        this.tagInfoProvider = tagInfoProvider;
    }
}
296:
/**
 * Creates a cleaner for the content of a URL, read with
 * {@link #DEFAULT_CHARSET}.
 * @param url location of the HTML
 * @param tagInfoProvider tag metadata source, passed through unchanged
 * @throws IOException propagated from the three-argument URL constructor
 */
public HtmlCleaner(URL url, ITagInfoProvider tagInfoProvider)
        throws IOException {
    this(url, DEFAULT_CHARSET, tagInfoProvider);
}
307:
/**
 * Creates a cleaner for the content of a URL, using the tag metadata of
 * {@code HtmlTagProvider.getInstance()}.
 * @param url location of the HTML
 * @param charset charset name used to read the content
 * @throws IOException propagated from the three-argument URL constructor
 */
public HtmlCleaner(URL url, String charset) throws IOException {
    this(url, charset, HtmlTagProvider.getInstance());
}
317:
/**
 * Creates a cleaner for the content of a URL, read with
 * {@link #DEFAULT_CHARSET} and the tag metadata of
 * {@code HtmlTagProvider.getInstance()}.
 * @param url location of the HTML
 * @throws IOException propagated from the three-argument URL constructor
 */
public HtmlCleaner(URL url) throws IOException {
    this(url, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
}
326:
/**
 * Creates a cleaner that reads the HTML from an input stream, decoded
 * with the platform default charset (no charset is passed to the reader).
 * @param in stream delivering the HTML
 * @param tagInfoProvider tag metadata source; when null,
 *        {@code HtmlTagProvider.getInstance()} is used
 */
public HtmlCleaner(InputStream in, ITagInfoProvider tagInfoProvider) {
    this.reader = new InputStreamReader(in);
    if (tagInfoProvider == null) {
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    } else {
        this.tagInfoProvider = tagInfoProvider;
    }
}
338:
/**
 * Creates a cleaner that reads the HTML from an input stream, using the
 * tag metadata of {@code HtmlTagProvider.getInstance()}.
 * @param in stream delivering the HTML
 */
public HtmlCleaner(InputStream in) {
    this(in, HtmlTagProvider.getInstance());
}
346:
347: DoctypeToken getDoctype() {
348: return _docType;
349: }
350:
351: void setDoctype(DoctypeToken type) {
352: _docType = type;
353: }
354:
/**
 * Creates a cleaner that reads the HTML from an input stream decoded
 * with the given charset.
 *
 * <p>The tag metadata of {@code HtmlTagProvider.getInstance()} is used,
 * as in the other constructors that take no provider. Without it the
 * provider field would stay null and the first tag lookup would fail
 * with a NullPointerException.</p>
 *
 * @param in stream delivering the HTML
 * @param charset name of the charset used to decode the stream
 * @throws IOException if the charset is not supported
 */
public HtmlCleaner(InputStream in, String charset)
        throws IOException {
    this.reader = new InputStreamReader(in, charset);
    this.tagInfoProvider = HtmlTagProvider.getInstance();
}
366:
367: public void clean(boolean isTextPlain, boolean addStyleSheet)
368: throws IOException {
369: allTags.clear();
370:
371: htmlNode = new TagNode("html");
372: bodyNode = new TagNode("body");
373: headNode = new TagNode("head");
374: if (addStyleSheet) {
375: if (isTextPlain) {
376: styleNode = new TagNode("link");
377: styleNode.addAttribute("href", "../css/preview.css");
378: styleNode.addAttribute("rel", "stylesheet");
379: styleNode.addAttribute("type", "text/css");
380: headNode.addChild(styleNode);
381: }
382: }
383: htmlNode.addChild(headNode);
384: htmlNode.addChild(bodyNode);
385:
386: HtmlTokenizer htmlTokenizer = new HtmlTokenizer(this );
387:
388: htmlTokenizer.start(isTextPlain);
389:
390: List nodeList = htmlTokenizer.getTokenList();
391: closeAll(nodeList);
392: createDocumentNodes(nodeList);
393: }
394:
395: Reader getReader() {
396: return reader;
397: }
398:
399: /**
400: * Add attributes from specified map to the specified tag.
401: * If some attribute already exist it is preserved.
402: * @param tag
403: * @param attributes
404: */
405: private void addAttributesToTag(TagNode tag, Map attributes) {
406: if (attributes != null) {
407: Map tagAttributes = tag.getAttributes();
408: Iterator it = attributes.entrySet().iterator();
409: while (it.hasNext()) {
410: Map.Entry currEntry = (Map.Entry) it.next();
411: String attName = (String) currEntry.getKey();
412: if (!tagAttributes.containsKey(attName)) {
413: String attValue = (String) currEntry.getValue();
414: tag.addAttribute(attName, attValue);
415: }
416: }
417: }
418: }
419:
420: /**
421: * Checks if open fatal tag is missing if there is a fatal tag for
422: * the specified tag.
423: * @param tag
424: */
425: private boolean isFatalTagSatisfied(TagInfo tag) {
426: if (tag != null) {
427: String fatalTagName = tag.getFatalTag();
428: return fatalTagName == null ? true : _openTags
429: .tagExists(fatalTagName);
430: }
431:
432: return true;
433: }
434:
435: /**
436: * Check if specified tag requires parent tag, but that parent
437: * tag is missing in the appropriate context.
438: * @param tag
439: */
440: private boolean mustAddRequiredParent(TagInfo tag) {
441: if (tag != null) {
442: String requiredParent = tag.getRequiredParent();
443: if (requiredParent != null) {
444: String fatalTag = tag.getFatalTag();
445: int fatalTagPositon = -1;
446: if (fatalTag != null) {
447: TagPos tagPos = _openTags.findTag(fatalTag);
448: if (tagPos != null) {
449: fatalTagPositon = tagPos.position;
450: }
451: }
452:
453: // iterates through the list of open tags from the end and check if there is some higher
454: ListIterator it = _openTags.list
455: .listIterator(_openTags.list.size());
456: while (it.hasPrevious()) {
457: TagPos currTagPos = (TagPos) it.previous();
458: if (tag.isHigher(currTagPos.name)) {
459: return currTagPos.position <= fatalTagPositon;
460: }
461: }
462:
463: return true;
464: }
465: }
466:
467: return false;
468: }
469:
470: private TagNode createTagNode(TagNode startTagToken) {
471: startTagToken.setFormed();
472: return startTagToken;
473: }
474:
475: private boolean isAllowedInLastOpenTag(BaseToken token) {
476: TagPos last = _openTags.getLastTagPos();
477: if (last != null) {
478: if (last.info != null) {
479: return last.info.allowsItem(token);
480: }
481: }
482:
483: return true;
484: }
485:
486: private void saveToLastOpenTag(List nodeList, Object tokenToAdd) {
487: TagPos last = _openTags.getLastTagPos();
488: if (last != null && last.info != null
489: && last.info.isIgnorePermitted()) {
490: return;
491: }
492:
493: TagPos rubbishPos = _openTags.findTagToPlaceRubbish();
494: if (rubbishPos != null) {
495: TagNode startTagToken = (TagNode) nodeList
496: .get(rubbishPos.position);
497: startTagToken.addItemForMoving(tokenToAdd);
498: }
499: }
500:
501: private boolean isStartToken(Object o) {
502: return (o instanceof TagNode) && !((TagNode) o).isFormed();
503: }
504:
/**
 * Consumes the tokens delivered by the iterator and arranges them into a
 * tree, using the open-tags bookkeeping of this cleaner.
 *
 * <p>Slots of tokens that were consumed (dropped, merged into the
 * html/head/body skeleton, moved elsewhere or nested under a closed tag)
 * are overwritten with null; null slots are skipped later when the
 * document nodes are created.</p>
 *
 * <p>NOTE(review): the iterator is presumably a list iterator over
 * {@code nodeList} itself (that is how closeSnippet() calls this method) -
 * confirm for the tokenizer call site.</p>
 *
 * @param nodeList list the tokens live in; positions of open tags refer to it
 * @param nodeIterator iterator positioned at the first token to process
 */
void makeTree(List nodeList, ListIterator nodeIterator) {
    // process until the end of the list is reached
    while (nodeIterator.hasNext()) {
        BaseToken token = (BaseToken) nodeIterator.next();

        if (token instanceof EndTagToken) {
            EndTagToken endTagToken = (EndTagToken) token;
            String tagName = endTagToken.getName();
            TagInfo tag = tagInfoProvider.getTagInfo(tagName);

            // end tags of omitted (unknown/deprecated) tags and of tags that
            // allow no body are simply dropped
            if ((tag == null && omitUnknownTags)
                    || (tag != null && tag.isDeprecated() && omitDeprecatedTags)) {
                nodeIterator.set(null);
            } else if (tag != null && !tag.allowsBody()) {
                nodeIterator.set(null);
            } else {
                TagPos matchingPosition = _openTags
                        .findTag(tagName);

                // close everything from the matching open tag up to this end tag;
                // an unmatched end tag is parked if the last open tag rejects it
                if (matchingPosition != null) {
                    closeSnippet(nodeList, matchingPosition,
                            endTagToken);
                } else if (!isAllowedInLastOpenTag(token)) {
                    saveToLastOpenTag(nodeList, token);
                }

                // the end tag itself never stays in the list
                nodeIterator.set(null);
            }
        } else if (isStartToken(token)) {
            TagNode startTagToken = (TagNode) token;
            String tagName = startTagToken.getName();
            TagInfo tag = tagInfoProvider.getTagInfo(tagName);

            // add tag to set of all tags
            allTags.add(tagName);

            // HTML open tag: only its attributes are merged into the skeleton node
            if ("html".equals(tagName)) {
                addAttributesToTag(htmlNode, startTagToken
                        .getAttributes());
                nodeIterator.set(null);
                // BODY open tag: same treatment
            } else if ("body".equals(tagName)) {
                addAttributesToTag(bodyNode, startTagToken
                        .getAttributes());
                nodeIterator.set(null);
                // HEAD open tag: same treatment
            } else if ("head".equals(tagName)) {
                addAttributesToTag(headNode, startTagToken
                        .getAttributes());
                nodeIterator.set(null);
                // unknown or deprecated tag, and such tags are configured to be omitted
            } else if ((tag == null && omitUnknownTags)
                    || (tag != null && tag.isDeprecated() && omitDeprecatedTags)) {
                nodeIterator.set(null);
                // one of the tag's "permitted tags" is already open: drop this tag
            } else if (tag != null
                    && tag.hasPermittedTags()
                    && _openTags.someAlreadyOpen(tag
                            .getPermittedTags())) {
                nodeIterator.set(null);
                // tag must be unique and was already encountered: ignore this occurrence
            } else if (tag != null && tag.isUnique()
                    && _openTags.tagEncountered(tagName)) {
                nodeIterator.set(null);
                // the required outer (fatal) tag is not open: this open tag is ignored
            } else if (!isFatalTagSatisfied(tag)) {
                nodeIterator.set(null);
                // required parent tag is missing - insert it before this open tag
            } else if (mustAddRequiredParent(tag)) {
                String requiredParent = tag.getRequiredParent();
                TagNode requiredParentStartToken = new TagNode(
                        requiredParent);
                // step back before the current token, insert the parent there and
                // step back once more so the next iteration processes the parent
                nodeIterator.previous();
                nodeIterator.add(requiredParentStartToken);
                nodeIterator.previous();
                // the last open tag must be closed by this tag
            } else if (tag != null
                    && !_openTags.isEmpty()
                    && tag
                            .isMustCloseTag(tagInfoProvider
                                    .getTagInfo(_openTags
                                            .getLastTagPos().name))) {
                List closed = closeSnippet(nodeList, _openTags
                        .getLastTagPos(), startTagToken);
                int closedCount = closed.size();

                // some of the closed tags may have to be re-opened as copies
                if (tag.hasCopyTags() && closedCount > 0) {
                    // first iterate over the closed tags from the back and collect the
                    // trailing run of start tokens that must be copied
                    ListIterator closedIt = closed
                            .listIterator(closedCount);
                    List toBeCopied = new ArrayList();
                    while (closedIt.hasPrevious()) {
                        TagNode currStartToken = (TagNode) closedIt
                                .previous();
                        if (tag.isCopy(currStartToken.getName())) {
                            toBeCopied.add(0, currStartToken);
                        } else {
                            break;
                        }
                    }

                    if (toBeCopied.size() > 0) {
                        Iterator copyIt = toBeCopied.iterator();
                        while (copyIt.hasNext()) {
                            TagNode currStartToken = (TagNode) copyIt
                                    .next();
                            nodeIterator.add(currStartToken
                                    .makeCopy());
                        }

                        // back to the previous place, before the newly added start tokens
                        for (int i = 0; i < toBeCopied.size(); i++) {
                            nodeIterator.previous();
                        }
                    }
                }

                // step back so the current start tag is examined again next iteration
                nodeIterator.previous();
                // this open tag is not allowed inside the last open tag: park it for moving
            } else if (!isAllowedInLastOpenTag(token)) {
                saveToLastOpenTag(nodeList, token);
                nodeIterator.set(null);
                // known tag that allows no body: it is closed immediately
            } else if (tag != null && !tag.allowsBody()) {
                TagNode newTagNode = createTagNode(startTagToken);
                if (tag.isHeadTag()) {
                    headNode.addChild(newTagNode);
                    nodeIterator.set(null);
                } else {
                    nodeIterator.set(newTagNode);
                }
                // default case - just remember this open tag and go further
            } else {
                _openTags.addTag(tagName, nodeIterator
                        .previousIndex());
            }
        } else {
            // any other token (neither end tag nor unformed start tag): park it
            // when the last open tag does not accept it, otherwise leave it in place
            if (!isAllowedInLastOpenTag(token)) {
                saveToLastOpenTag(nodeList, token);
                nodeIterator.set(null);
            }
        }
    }
}
651:
652: private void createDocumentNodes(List listNodes) {
653: Iterator it = listNodes.iterator();
654: while (it.hasNext()) {
655: Object child = it.next();
656:
657: if (child == null) {
658: continue;
659: }
660:
661: TagNode parent = bodyNode;
662: boolean toAdd = true;
663:
664: if (child instanceof TagNode) {
665: TagInfo tag = tagInfoProvider
666: .getTagInfo(((TagNode) child).getName());
667: if (tag != null) {
668: if (tag.isHeadTag()
669: || (tag.isHeadAndBodyTag() && bodyNode
670: .getChildren().isEmpty())) {
671: parent = headNode;
672: }
673: }
674: } else {
675: if (child instanceof ContentToken) {
676: toAdd = !"".equals(((ContentToken) child)
677: .toString());
678: }
679: }
680:
681: if (toAdd) {
682: parent.addChild(child);
683: }
684: }
685: }
686:
/**
 * Closes the open tags found in the list from {@code tagPos.position}
 * onwards, nesting each following item under the most recently closed tag.
 *
 * <p>Iteration stops when {@code toNode} is reached (compared by identity)
 * or, when {@code toNode} is null, at the end of the list.</p>
 *
 * <p>NOTE(review): if {@code toNode} is non-null but never met after
 * {@code tagPos.position}, the loop condition stays true at the list end
 * and the loop would not terminate. The callers in this file pass the
 * token currently being processed - confirm it is always in this list.</p>
 *
 * @param nodeList list holding the tokens
 * @param tagPos open tag at whose position closing starts
 * @param toNode item at which to stop (exclusive), or null for "to the end"
 * @return the start tokens that were closed, in list order
 */
private List closeSnippet(List nodeList, TagPos tagPos,
        Object toNode) {
    List closed = new ArrayList();
    ListIterator it = nodeList.listIterator(tagPos.position);

    // most recently closed tag; later items become its children
    TagNode tagNode = null;
    Object item = it.next();
    boolean isListEnd = false;

    while ((toNode == null && !isListEnd)
            || (toNode != null && item != toNode)) {
        if (isStartToken(item)) {
            TagNode startTagToken = (TagNode) item;
            closed.add(startTagToken);
            List itemsToMove = startTagToken.getItemsToMove();
            if (itemsToMove != null) {
                // build a tree out of the parked items on their own, with a fresh
                // open-tags state, then restore the current state
                OpenTags prevOpenTags = _openTags;
                _openTags = new OpenTags();
                makeTree(itemsToMove, itemsToMove.listIterator(0));
                closeAll(itemsToMove);
                startTagToken.setItemsToMove(null);
                _openTags = prevOpenTags;
            }

            TagNode newTagNode = createTagNode(startTagToken);

            TagInfo tag = tagInfoProvider.getTagInfo(newTagNode
                    .getName());
            if (tag != null && tag.isHeadTag()) {
                // head tags go straight to the head node; note that itemsToMove
                // is not attached anywhere in this branch
                headNode.addChild(newTagNode);
                it.set(null);
            } else if (tagNode != null) {
                // nested case: parked items first, then the tag itself, both
                // under the previously closed tag
                tagNode.addChildren(itemsToMove);
                tagNode.addChild(newTagNode);
                it.set(null);
            } else {
                // first closed tag stays in the list; if it had parked items, the
                // slot holds the list of those items followed by the tag
                if (itemsToMove != null) {
                    itemsToMove.add(newTagNode);
                    it.set(itemsToMove);
                } else {
                    it.set(newTagNode);
                }
            }

            _openTags.removeTag(newTagNode.getName());
            tagNode = newTagNode;
        } else {
            // non-start item: once a tag has been closed, move the item under it
            if (tagNode != null) {
                it.set(null);
                if (item != null) {
                    tagNode.addChild(item);
                }
            }
        }

        if (it.hasNext()) {
            item = it.next();
        } else {
            isListEnd = true;
        }
    }

    return closed;
}
751:
752: /**
753: * Close all unclosed tags if there are any.
754: */
755: private void closeAll(List nodeList) {
756: TagPos firstTagPos = _openTags.findFirstTagPos();
757: if (firstTagPos != null) {
758: closeSnippet(nodeList, firstTagPos, null);
759: }
760: }
761:
762: // setters and getters
763:
764: public boolean isOmitUnknownTags() {
765: return omitUnknownTags;
766: }
767:
768: public void setOmitUnknownTags(boolean omitUnknownTags) {
769: this .omitUnknownTags = omitUnknownTags;
770: }
771:
772: public boolean isOmitDeprecatedTags() {
773: return omitDeprecatedTags;
774: }
775:
776: public void setOmitDeprecatedTags(boolean omitDeprecatedTags) {
777: this .omitDeprecatedTags = omitDeprecatedTags;
778: }
779:
780: public boolean isAdvancedXmlEscape() {
781: return advancedXmlEscape;
782: }
783:
784: public void setAdvancedXmlEscape(boolean advancedXmlEscape) {
785: this .advancedXmlEscape = advancedXmlEscape;
786: }
787:
788: public boolean isUseCdataForScriptAndStyle() {
789: return useCdataForScriptAndStyle;
790: }
791:
792: public void setUseCdataForScriptAndStyle(
793: boolean useCdataForScriptAndStyle) {
794: this .useCdataForScriptAndStyle = useCdataForScriptAndStyle;
795: }
796:
797: public boolean isTranslateSpecialEntities() {
798: return translateSpecialEntities;
799: }
800:
801: public void setTranslateSpecialEntities(
802: boolean translateSpecialEntities) {
803: this .translateSpecialEntities = translateSpecialEntities;
804: }
805:
806: public boolean isRecognizeUnicodeChars() {
807: return recognizeUnicodeChars;
808: }
809:
810: public void setRecognizeUnicodeChars(boolean recognizeUnicodeChars) {
811: this .recognizeUnicodeChars = recognizeUnicodeChars;
812: }
813:
814: public boolean isOmitComments() {
815: return omitComments;
816: }
817:
818: public void setOmitComments(boolean omitComments) {
819: this .omitComments = omitComments;
820: }
821:
822: public boolean isOmitXmlDeclaration() {
823: return omitXmlDeclaration;
824: }
825:
826: public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
827: this .omitXmlDeclaration = omitXmlDeclaration;
828: }
829:
830: public boolean isOmitDoctypeDeclaration() {
831: return omitDoctypeDeclaration;
832: }
833:
834: public void setOmitDoctypeDeclaration(boolean omitDoctypeDeclaration) {
835: this .omitDoctypeDeclaration = omitDoctypeDeclaration;
836: }
837:
838: public boolean isOmitXmlnsAttributes() {
839: return omitXmlnsAttributes;
840: }
841:
842: public void setOmitXmlnsAttributes(boolean omitXmlnsAttributes) {
843: this .omitXmlnsAttributes = omitXmlnsAttributes;
844: }
845:
846: public String getHyphenReplacementInComment() {
847: return hyphenReplacementInComment;
848: }
849:
850: public void setHyphenReplacementInComment(
851: String hyphenReplacementInComment) {
852: this .hyphenReplacementInComment = hyphenReplacementInComment;
853: }
854:
855: public Set getAllTags() {
856: return allTags;
857: }
858:
859: // methods for creating result
860:
861: /**
862: * Creates XML DOM document object.
863: * @return Instance of org.w3c.dom.Document
864: */
865: public Document createDOM() throws ParserConfigurationException {
866: DomSerializer domSerializer = new DomSerializer();
867: return domSerializer.createDOM(htmlNode);
868: }
869:
870: /**
871: * The most general way to serialize resulting XML.
872: * @param xmlSerializer
873: * @throws IOException
874: */
875: public void writeXml(XmlSerializer xmlSerializer)
876: throws IOException {
877: xmlSerializer.createXml(htmlNode);
878: }
879:
880: private void writeXml(Writer writer, int method) throws IOException {
881: XmlSerializer xmlSerializer = null;
882:
883: if (WRITE_METHOD_COMPACT == method) {
884: xmlSerializer = new CompactXmlSerializer(writer, this );
885: } else if (WRITE_METHOD_PRETTY == method) {
886: xmlSerializer = new PrettyXmlSerializer(writer, this );
887: } else {
888: xmlSerializer = new SimpleXmlSerializer(writer, this );
889: }
890:
891: xmlSerializer.createXml(htmlNode);
892: }
893:
894: private void writeToStream(OutputStream out, String charset,
895: int method) throws IOException {
896: BufferedWriter writer = new BufferedWriter(
897: new OutputStreamWriter(out, charset));
898: writeXml(writer, method);
899: }
900:
901: private void writeToStream(OutputStream out, int method)
902: throws IOException {
903: BufferedWriter writer = new BufferedWriter(
904: new OutputStreamWriter(out));
905: writeXml(writer, method);
906: }
907:
908: public void writeXmlToStream(OutputStream out) throws IOException {
909: writeToStream(out, WRITE_METHOD_SIMPLE);
910: }
911:
912: public void writeXmlToStream(OutputStream out, String charset)
913: throws IOException {
914: writeToStream(out, charset, WRITE_METHOD_SIMPLE);
915: }
916:
917: public void writeCompactXmlToStream(OutputStream out)
918: throws IOException {
919: writeToStream(out, WRITE_METHOD_COMPACT);
920: }
921:
922: public void writeCompactXmlToStream(OutputStream out, String charset)
923: throws IOException {
924: writeToStream(out, charset, WRITE_METHOD_COMPACT);
925: }
926:
927: public void writePrettyXmlToStream(OutputStream out)
928: throws IOException {
929: writeToStream(out, WRITE_METHOD_PRETTY);
930: }
931:
932: public void writePrettyXmlToStream(OutputStream out, String charset)
933: throws IOException {
934: writeToStream(out, charset, WRITE_METHOD_PRETTY);
935: }
936:
937: private void writeToFile(String fileName, String charset, int method)
938: throws IOException {
939: writeToStream(new FileOutputStream(fileName), charset, method);
940: }
941:
942: private void writeToFile(String fileName, int method)
943: throws IOException {
944: writeToStream(new FileOutputStream(fileName), method);
945: }
946:
947: public void writeXmlToFile(String fileName) throws IOException {
948: writeToFile(fileName, WRITE_METHOD_SIMPLE);
949: }
950:
951: public void writeXmlToFile(String fileName, String charset)
952: throws IOException {
953: writeToFile(fileName, charset, WRITE_METHOD_SIMPLE);
954: }
955:
956: public void writeCompactXmlToFile(String fileName)
957: throws IOException {
958: writeToFile(fileName, WRITE_METHOD_COMPACT);
959: }
960:
961: public void writeCompactXmlToFile(String fileName, String charset)
962: throws IOException {
963: writeToFile(fileName, charset, WRITE_METHOD_COMPACT);
964: }
965:
966: public void writePrettyXmlToFile(String fileName)
967: throws IOException {
968: writeToFile(fileName, WRITE_METHOD_PRETTY);
969: }
970:
971: public void writePrettyXmlToFile(String fileName, String charset)
972: throws IOException {
973: writeToFile(fileName, charset, WRITE_METHOD_PRETTY);
974: }
975:
976: public String getXmlAsString() throws IOException {
977: StringWriter writer = new StringWriter();
978: writeXml(writer, WRITE_METHOD_SIMPLE);
979:
980: return writer.getBuffer().toString();
981: }
982:
983: public String getCompactXmlAsString() throws IOException {
984: StringWriter writer = new StringWriter();
985: writeXml(writer, WRITE_METHOD_COMPACT);
986:
987: return writer.getBuffer().toString();
988: }
989:
990: public String getPrettyXmlAsString() throws IOException {
991: StringWriter writer = new StringWriter();
992: writeXml(writer, WRITE_METHOD_PRETTY);
993:
994: return writer.getBuffer().toString();
995: }
996:
997: }
|