001: /* Copyright (c) 2006-2007, Vladimir Nikic
002: All rights reserved.
003:
004: Redistribution and use of this software in source and binary forms,
005: with or without modification, are permitted provided that the following
006: conditions are met:
007:
008: * Redistributions of source code must retain the above
009: copyright notice, this list of conditions and the
010: following disclaimer.
011:
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the
014: following disclaimer in the documentation and/or other
015: materials provided with the distribution.
016:
017: * The name of HtmlCleaner may not be used to endorse or promote
018: products derived from this software without specific prior
019: written permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
022: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
023: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
024: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
025: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
026: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
027: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
028: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
029: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
030: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
031: POSSIBILITY OF SUCH DAMAGE.
032:
033: You can contact Vladimir Nikic by sending e-mail to
034: nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
035: subject line.
036: */
037:
038: package org.htmlcleaner;
039:
040: import org.w3c.dom.Document;
041:
042: import javax.xml.parsers.ParserConfigurationException;
043: import java.io.*;
044: import java.net.URL;
045: import java.util.*;
046:
/**
 * Main HtmlCleaner class.
 *
 * <p>It represents the public interface to the user. Its task is to call the tokenizer with
 * the specified source HTML, traverse the produced token list and create the internal
 * object model. It also offers a set of methods to write the resulting XML to a string,
 * file or any output stream.</p>
 * <p>Typical usage is the following:</p>
 *
 * <xmp>
 * HtmlCleaner cleaner = new HtmlCleaner(...); // one of few constructors
 * cleaner.setXXX(...) // optionally, set cleaner's behaviour
 * cleaner.clean(...); // calls cleaning process
 * cleaner.writeXmlXXX(...); // writes resulting XML to string, file or any output stream
 * // cleaner.createDOM(); // alternatively, creates an org.w3c.dom.Document from the result
 * </xmp>
 *
 * Created by: Vladimir Nikic <br/>
 * Date: November, 2006
 */
067: public class HtmlCleaner {
068:
069: public static final String DEFAULT_CHARSET = System
070: .getProperty("file.encoding");
071:
072: private static final int WRITE_METHOD_SIMPLE = 0;
073: private static final int WRITE_METHOD_COMPACT = 1;
074: private static final int WRITE_METHOD_PRETTY = 2;
075:
076: /**
077: * Contains information about single open tag
078: */
079: private class TagPos {
080: private int position;
081: private String name;
082: private TagInfo info;
083:
084: TagPos(int position, String name) {
085: this .position = position;
086: this .name = name;
087: this .info = tagInfoProvider.getTagInfo(name);
088: }
089: }
090:
091: /**
092: * Class that contains information and mathods for managing list of open,
093: * but unhandled tags.
094: */
095: private class OpenTags {
096: private List list = new ArrayList();
097: private TagPos last = null;
098: private Set set = new HashSet();
099:
100: private boolean isEmpty() {
101: return list.isEmpty();
102: }
103:
104: private void addTag(String tagName, int position) {
105: last = new TagPos(position, tagName);
106: list.add(last);
107: set.add(tagName);
108: }
109:
110: private void removeTag(String tagName) {
111: ListIterator it = list.listIterator(list.size());
112: while (it.hasPrevious()) {
113: TagPos currTagPos = (TagPos) it.previous();
114: if (tagName.equals(currTagPos.name)) {
115: it.remove();
116: break;
117: }
118: }
119:
120: last = list.isEmpty() ? null : (TagPos) list.get(list
121: .size() - 1);
122: }
123:
124: private TagPos findFirstTagPos() {
125: return list.isEmpty() ? null : (TagPos) list.get(0);
126: }
127:
128: private TagPos getLastTagPos() {
129: return last;
130: }
131:
132: private TagPos findTag(String tagName) {
133: if (tagName != null) {
134: ListIterator it = list.listIterator(list.size());
135: while (it.hasPrevious()) {
136: TagPos currTagPos = (TagPos) it.previous();
137: if (tagName.equals(currTagPos.name)) {
138: return currTagPos;
139: }
140: }
141: }
142:
143: return null;
144: }
145:
146: private boolean tagExists(String tagName) {
147: TagPos tagPos = findTag(tagName);
148: return tagPos != null;
149: }
150:
151: private TagPos findTagToPlaceRubbish() {
152: TagPos result = null, prev = null;
153:
154: if (!isEmpty()) {
155: ListIterator it = list.listIterator(list.size());
156: while (it.hasPrevious()) {
157: result = (TagPos) it.previous();
158: if (result.info == null
159: || result.info.allowsAnything()) {
160: if (prev != null) {
161: return prev;
162: }
163: }
164: prev = result;
165: }
166: }
167:
168: return result;
169: }
170:
171: private boolean tagEncountered(String tagName) {
172: return set.contains(tagName);
173: }
174:
175: /**
176: * Checks if any of tags specified in the set are already open.
177: * @param tags
178: */
179: private boolean someAlreadyOpen(Set tags) {
180: Iterator it = list.iterator();
181: while (it.hasNext()) {
182: TagPos curr = (TagPos) it.next();
183: if (tags.contains(curr.name)) {
184: return true;
185: }
186: }
187:
188: return false;
189: }
190: }
191:
192: private ITagInfoProvider tagInfoProvider;
193:
194: private Reader reader;
195: private transient OpenTags _openTags = new OpenTags();
196: private transient DoctypeToken _docType = null;
197: private Set allTags = new TreeSet();
198:
199: private boolean advancedXmlEscape = true;
200: private boolean useCdataForScriptAndStyle = true;
201: private boolean translateSpecialEntities = true;
202: private boolean recognizeUnicodeChars = true;
203: private boolean omitUnknownTags = false;
204: private boolean omitDeprecatedTags = false;
205: private boolean omitComments = false;
206: private boolean omitXmlDeclaration = false;
207: private boolean omitDoctypeDeclaration = true;
208: private boolean omitXmlnsAttributes = false;
209: private String hyphenReplacementInComment = "=";
210:
211: private TagNode htmlNode;
212: private TagNode bodyNode;
213: private TagNode headNode;
214: private TagNode styleNode;
215:
/**
 * Creates a cleaner that reads the HTML from a string.
 *
 * @param htmlContent HTML source to clean
 * @param tagInfoProvider source of tag metadata; when null,
 *        {@code HtmlTagProvider.getInstance()} is used
 */
public HtmlCleaner(String htmlContent, ITagInfoProvider tagInfoProvider) {
    this.reader = new StringReader(htmlContent);
    if (tagInfoProvider == null) {
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    } else {
        this.tagInfoProvider = tagInfoProvider;
    }
}
228:
/**
 * Creates a cleaner that reads the HTML from a string and uses the
 * default tag provider ({@code HtmlTagProvider.getInstance()}).
 *
 * @param htmlContent HTML source to clean
 */
public HtmlCleaner(String htmlContent) {
    this(htmlContent, HtmlTagProvider.getInstance());
}
237:
238: /**
239: * Constructor - creates the instance for specified file.
240: * @param file
241: * @param charset
242: * @throws IOException
243: */
244: public HtmlCleaner(File file, String charset,
245: ITagInfoProvider tagInfoProvider) throws IOException {
246: FileInputStream in = new FileInputStream(file);
247: this .reader = new InputStreamReader(in, charset);
248: this .tagInfoProvider = tagInfoProvider == null ? HtmlTagProvider
249: .getInstance()
250: : tagInfoProvider;
251: }
252:
/**
 * Creates a cleaner that reads the HTML from a file, using the default
 * tag provider ({@code HtmlTagProvider.getInstance()}).
 *
 * @param file file containing the HTML source
 * @param charset name of the charset used to decode the file
 * @throws IOException propagated from the three-argument file constructor
 */
public HtmlCleaner(File file, String charset) throws IOException {
    this(file, charset, HtmlTagProvider.getInstance());
}
262:
/**
 * Creates a cleaner that reads the HTML from a file decoded with
 * {@link #DEFAULT_CHARSET}.
 *
 * @param file file containing the HTML source
 * @param tagInfoProvider source of tag metadata
 * @throws IOException propagated from the three-argument file constructor
 */
public HtmlCleaner(File file, ITagInfoProvider tagInfoProvider) throws IOException {
    this(file, DEFAULT_CHARSET, tagInfoProvider);
}
272:
/**
 * Creates a cleaner that reads the HTML from a file decoded with
 * {@link #DEFAULT_CHARSET}, using the default tag provider.
 *
 * @param file file containing the HTML source
 * @throws IOException propagated from the three-argument file constructor
 */
public HtmlCleaner(File file) throws IOException {
    this(file, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
}
281:
/**
 * Creates a cleaner for the content of a URL. The content is fetched
 * eagerly through {@code Utils.readUrl} and then read from memory.
 *
 * @param url location of the HTML source
 * @param charset charset name handed to {@code Utils.readUrl}
 * @param tagInfoProvider source of tag metadata; when null,
 *        {@code HtmlTagProvider.getInstance()} is used
 * @throws IOException if reading the URL fails
 */
public HtmlCleaner(URL url, String charset, ITagInfoProvider tagInfoProvider) throws IOException {
    String html = Utils.readUrl(url, charset).toString();
    this.reader = new StringReader(html);
    if (tagInfoProvider == null) {
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    } else {
        this.tagInfoProvider = tagInfoProvider;
    }
}
296:
/**
 * Creates a cleaner for the content of a URL, read with
 * {@link #DEFAULT_CHARSET}.
 *
 * @param url location of the HTML source
 * @param tagInfoProvider source of tag metadata
 * @throws IOException propagated from the three-argument URL constructor
 */
public HtmlCleaner(URL url, ITagInfoProvider tagInfoProvider) throws IOException {
    this(url, DEFAULT_CHARSET, tagInfoProvider);
}
307:
/**
 * Creates a cleaner for the content of a URL, using the default tag
 * provider ({@code HtmlTagProvider.getInstance()}).
 *
 * @param url location of the HTML source
 * @param charset charset name used when reading the URL
 * @throws IOException propagated from the three-argument URL constructor
 */
public HtmlCleaner(URL url, String charset) throws IOException {
    this(url, charset, HtmlTagProvider.getInstance());
}
317:
/**
 * Creates a cleaner for the content of a URL, read with
 * {@link #DEFAULT_CHARSET} and using the default tag provider.
 *
 * @param url location of the HTML source
 * @throws IOException propagated from the three-argument URL constructor
 */
public HtmlCleaner(URL url) throws IOException {
    this(url, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
}
326:
/**
 * Creates a cleaner that reads the HTML from an input stream. No charset
 * is passed to the {@code InputStreamReader}, so its default decoding applies.
 *
 * @param in stream with the HTML source
 * @param tagInfoProvider source of tag metadata; when null,
 *        {@code HtmlTagProvider.getInstance()} is used
 */
public HtmlCleaner(InputStream in, ITagInfoProvider tagInfoProvider) {
    this.reader = new InputStreamReader(in);
    if (tagInfoProvider == null) {
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    } else {
        this.tagInfoProvider = tagInfoProvider;
    }
}
338:
/**
 * Creates a cleaner that reads the HTML from an input stream, using the
 * default tag provider ({@code HtmlTagProvider.getInstance()}).
 *
 * @param in stream with the HTML source
 */
public HtmlCleaner(InputStream in) {
    this(in, HtmlTagProvider.getInstance());
}
346:
347: DoctypeToken getDoctype() {
348: return _docType;
349: }
350:
351: void setDoctype(DoctypeToken type) {
352: _docType = type;
353: }
354:
/**
 * Creates a cleaner that reads the HTML from an input stream decoded
 * with the given charset, using the default tag provider
 * ({@code HtmlTagProvider.getInstance()}).
 *
 * @param in stream with the HTML source
 * @param charset name of the charset used to decode the stream
 * @throws IOException if the charset is not supported
 */
public HtmlCleaner(InputStream in, String charset) throws IOException {
    this.reader = new InputStreamReader(in, charset);
    // Every other constructor installs a provider; without one, the first
    // tag lookup during cleaning fails with a NullPointerException.
    this.tagInfoProvider = HtmlTagProvider.getInstance();
}
366:
367: public void clean(boolean isTextPlain, boolean addStyleSheet)
368: throws IOException {
369: allTags.clear();
370:
371: htmlNode = new TagNode("html");
372: bodyNode = new TagNode("body");
373: headNode = new TagNode("head");
374: if (addStyleSheet) {
375: styleNode = new TagNode("link");
376: styleNode.addAttribute("href", "../css/preview.css");
377: styleNode.addAttribute("rel", "stylesheet");
378: styleNode.addAttribute("type", "text/css");
379: headNode.addChild(styleNode);
380: }
381: htmlNode.addChild(headNode);
382: htmlNode.addChild(bodyNode);
383:
384: HtmlTokenizer htmlTokenizer = new HtmlTokenizer(this );
385:
386: htmlTokenizer.start(isTextPlain);
387:
388: List nodeList = htmlTokenizer.getTokenList();
389: closeAll(nodeList);
390: createDocumentNodes(nodeList);
391: }
392:
393: Reader getReader() {
394: return reader;
395: }
396:
397: /**
398: * Add attributes from specified map to the specified tag.
399: * If some attribute already exist it is preserved.
400: * @param tag
401: * @param attributes
402: */
403: private void addAttributesToTag(TagNode tag, Map attributes) {
404: if (attributes != null) {
405: Map tagAttributes = tag.getAttributes();
406: Iterator it = attributes.entrySet().iterator();
407: while (it.hasNext()) {
408: Map.Entry currEntry = (Map.Entry) it.next();
409: String attName = (String) currEntry.getKey();
410: if (!tagAttributes.containsKey(attName)) {
411: String attValue = (String) currEntry.getValue();
412: tag.addAttribute(attName, attValue);
413: }
414: }
415: }
416: }
417:
418: /**
419: * Checks if open fatal tag is missing if there is a fatal tag for
420: * the specified tag.
421: * @param tag
422: */
423: private boolean isFatalTagSatisfied(TagInfo tag) {
424: if (tag != null) {
425: String fatalTagName = tag.getFatalTag();
426: return fatalTagName == null ? true : _openTags
427: .tagExists(fatalTagName);
428: }
429:
430: return true;
431: }
432:
433: /**
434: * Check if specified tag requires parent tag, but that parent
435: * tag is missing in the appropriate context.
436: * @param tag
437: */
438: private boolean mustAddRequiredParent(TagInfo tag) {
439: if (tag != null) {
440: String requiredParent = tag.getRequiredParent();
441: if (requiredParent != null) {
442: String fatalTag = tag.getFatalTag();
443: int fatalTagPositon = -1;
444: if (fatalTag != null) {
445: TagPos tagPos = _openTags.findTag(fatalTag);
446: if (tagPos != null) {
447: fatalTagPositon = tagPos.position;
448: }
449: }
450:
451: // iterates through the list of open tags from the end and check if there is some higher
452: ListIterator it = _openTags.list
453: .listIterator(_openTags.list.size());
454: while (it.hasPrevious()) {
455: TagPos currTagPos = (TagPos) it.previous();
456: if (tag.isHigher(currTagPos.name)) {
457: return currTagPos.position <= fatalTagPositon;
458: }
459: }
460:
461: return true;
462: }
463: }
464:
465: return false;
466: }
467:
468: private TagNode createTagNode(TagNode startTagToken) {
469: startTagToken.setFormed();
470: return startTagToken;
471: }
472:
473: private boolean isAllowedInLastOpenTag(BaseToken token) {
474: TagPos last = _openTags.getLastTagPos();
475: if (last != null) {
476: if (last.info != null) {
477: return last.info.allowsItem(token);
478: }
479: }
480:
481: return true;
482: }
483:
484: private void saveToLastOpenTag(List nodeList, Object tokenToAdd) {
485: TagPos last = _openTags.getLastTagPos();
486: if (last != null && last.info != null
487: && last.info.isIgnorePermitted()) {
488: return;
489: }
490:
491: TagPos rubbishPos = _openTags.findTagToPlaceRubbish();
492: if (rubbishPos != null) {
493: TagNode startTagToken = (TagNode) nodeList
494: .get(rubbishPos.position);
495: startTagToken.addItemForMoving(tokenToAdd);
496: }
497: }
498:
499: private boolean isStartToken(Object o) {
500: return (o instanceof TagNode) && !((TagNode) o).isFormed();
501: }
502:
503: void makeTree(List nodeList, ListIterator nodeIterator) {
504: // process while not reach the end of the list
505: while (nodeIterator.hasNext()) {
506: BaseToken token = (BaseToken) nodeIterator.next();
507:
508: if (token instanceof EndTagToken) {
509: EndTagToken endTagToken = (EndTagToken) token;
510: String tagName = endTagToken.getName();
511: TagInfo tag = tagInfoProvider.getTagInfo(tagName);
512:
513: if ((tag == null && omitUnknownTags)
514: || (tag != null && tag.isDeprecated() && omitDeprecatedTags)) {
515: nodeIterator.set(null);
516: } else if (tag != null && !tag.allowsBody()) {
517: nodeIterator.set(null);
518: } else {
519: TagPos matchingPosition = _openTags
520: .findTag(tagName);
521:
522: if (matchingPosition != null) {
523: closeSnippet(nodeList, matchingPosition,
524: endTagToken);
525: } else if (!isAllowedInLastOpenTag(token)) {
526: saveToLastOpenTag(nodeList, token);
527: }
528:
529: nodeIterator.set(null);
530: }
531: } else if (isStartToken(token)) {
532: TagNode startTagToken = (TagNode) token;
533: String tagName = startTagToken.getName();
534: TagInfo tag = tagInfoProvider.getTagInfo(tagName);
535:
536: // add tag to set of all tags
537: allTags.add(tagName);
538:
539: // HTML open tag
540: if ("html".equals(tagName)) {
541: addAttributesToTag(htmlNode, startTagToken
542: .getAttributes());
543: nodeIterator.set(null);
544: // BODY open tag
545: } else if ("body".equals(tagName)) {
546: addAttributesToTag(bodyNode, startTagToken
547: .getAttributes());
548: nodeIterator.set(null);
549: // HEAD open tag
550: } else if ("head".equals(tagName)) {
551: addAttributesToTag(headNode, startTagToken
552: .getAttributes());
553: nodeIterator.set(null);
554: // unknows HTML tag and unknown tags are not allowed
555: } else if ((tag == null && omitUnknownTags)
556: || (tag != null && tag.isDeprecated() && omitDeprecatedTags)) {
557: nodeIterator.set(null);
558: } else if (tag != null
559: && tag.hasPermittedTags()
560: && _openTags.someAlreadyOpen(tag
561: .getPermittedTags())) {
562: nodeIterator.set(null);
563: // if tag that must be unique, ignore this occurence
564: } else if (tag != null && tag.isUnique()
565: && _openTags.tagEncountered(tagName)) {
566: nodeIterator.set(null);
567: // if there is no required outer tag without that this open tag is ignored
568: } else if (!isFatalTagSatisfied(tag)) {
569: nodeIterator.set(null);
570: // if there is no required parent tag - it must be added before this open tag
571: } else if (mustAddRequiredParent(tag)) {
572: String requiredParent = tag.getRequiredParent();
573: TagNode requiredParentStartToken = new TagNode(
574: requiredParent);
575: nodeIterator.previous();
576: nodeIterator.add(requiredParentStartToken);
577: nodeIterator.previous();
578: // if last open tag has lower presidence then this, it must be closed
579: } else if (tag != null
580: && !_openTags.isEmpty()
581: && tag
582: .isMustCloseTag(tagInfoProvider
583: .getTagInfo(_openTags
584: .getLastTagPos().name))) {
585: List closed = closeSnippet(nodeList, _openTags
586: .getLastTagPos(), startTagToken);
587: int closedCount = closed.size();
588:
589: // it is needed to copy some tags again in front of current, if there are any
590: if (tag.hasCopyTags() && closedCount > 0) {
591: // first iterates over list from the back and collects all start tokens
592: // in sequence that must be copied
593: ListIterator closedIt = closed
594: .listIterator(closedCount);
595: List toBeCopied = new ArrayList();
596: while (closedIt.hasPrevious()) {
597: TagNode currStartToken = (TagNode) closedIt
598: .previous();
599: if (tag.isCopy(currStartToken.getName())) {
600: toBeCopied.add(0, currStartToken);
601: } else {
602: break;
603: }
604: }
605:
606: if (toBeCopied.size() > 0) {
607: Iterator copyIt = toBeCopied.iterator();
608: while (copyIt.hasNext()) {
609: TagNode currStartToken = (TagNode) copyIt
610: .next();
611: nodeIterator.add(currStartToken
612: .makeCopy());
613: }
614:
615: // back to the previous place, before adding new start tokens
616: for (int i = 0; i < toBeCopied.size(); i++) {
617: nodeIterator.previous();
618: }
619: }
620: }
621:
622: nodeIterator.previous();
623: // if this open tag is not allowed inside last open tag, then it must be moved to the place where it can be
624: } else if (!isAllowedInLastOpenTag(token)) {
625: saveToLastOpenTag(nodeList, token);
626: nodeIterator.set(null);
627: // if it is known HTML tag but doesn't allow body, it is immidiately closed
628: } else if (tag != null && !tag.allowsBody()) {
629: TagNode newTagNode = createTagNode(startTagToken);
630: if (tag.isHeadTag()) {
631: headNode.addChild(newTagNode);
632: nodeIterator.set(null);
633: } else {
634: nodeIterator.set(newTagNode);
635: }
636: // default case - just remember this open tag and go further
637: } else {
638: _openTags.addTag(tagName, nodeIterator
639: .previousIndex());
640: }
641: } else {
642: if (!isAllowedInLastOpenTag(token)) {
643: saveToLastOpenTag(nodeList, token);
644: nodeIterator.set(null);
645: }
646: }
647: }
648: }
649:
650: private void createDocumentNodes(List listNodes) {
651: Iterator it = listNodes.iterator();
652: while (it.hasNext()) {
653: Object child = it.next();
654:
655: if (child == null) {
656: continue;
657: }
658:
659: TagNode parent = bodyNode;
660: boolean toAdd = true;
661:
662: if (child instanceof TagNode) {
663: TagInfo tag = tagInfoProvider
664: .getTagInfo(((TagNode) child).getName());
665: if (tag != null) {
666: if (tag.isHeadTag()
667: || (tag.isHeadAndBodyTag() && bodyNode
668: .getChildren().isEmpty())) {
669: parent = headNode;
670: }
671: }
672: } else {
673: if (child instanceof ContentToken) {
674: toAdd = !"".equals(((ContentToken) child)
675: .toString());
676: }
677: }
678:
679: if (toAdd) {
680: parent.addChild(child);
681: }
682: }
683: }
684:
685: private List closeSnippet(List nodeList, TagPos tagPos,
686: Object toNode) {
687: List closed = new ArrayList();
688: ListIterator it = nodeList.listIterator(tagPos.position);
689:
690: TagNode tagNode = null;
691: Object item = it.next();
692: boolean isListEnd = false;
693:
694: while ((toNode == null && !isListEnd)
695: || (toNode != null && item != toNode)) {
696: if (isStartToken(item)) {
697: TagNode startTagToken = (TagNode) item;
698: closed.add(startTagToken);
699: List itemsToMove = startTagToken.getItemsToMove();
700: if (itemsToMove != null) {
701: OpenTags prevOpenTags = _openTags;
702: _openTags = new OpenTags();
703: makeTree(itemsToMove, itemsToMove.listIterator(0));
704: closeAll(itemsToMove);
705: startTagToken.setItemsToMove(null);
706: _openTags = prevOpenTags;
707: }
708:
709: TagNode newTagNode = createTagNode(startTagToken);
710:
711: TagInfo tag = tagInfoProvider.getTagInfo(newTagNode
712: .getName());
713: if (tag != null && tag.isHeadTag()) {
714: headNode.addChild(newTagNode);
715: it.set(null);
716: } else if (tagNode != null) {
717: tagNode.addChildren(itemsToMove);
718: tagNode.addChild(newTagNode);
719: it.set(null);
720: } else {
721: if (itemsToMove != null) {
722: itemsToMove.add(newTagNode);
723: it.set(itemsToMove);
724: } else {
725: it.set(newTagNode);
726: }
727: }
728:
729: _openTags.removeTag(newTagNode.getName());
730: tagNode = newTagNode;
731: } else {
732: if (tagNode != null) {
733: it.set(null);
734: if (item != null) {
735: tagNode.addChild(item);
736: }
737: }
738: }
739:
740: if (it.hasNext()) {
741: item = it.next();
742: } else {
743: isListEnd = true;
744: }
745: }
746:
747: return closed;
748: }
749:
750: /**
751: * Close all unclosed tags if there are any.
752: */
753: private void closeAll(List nodeList) {
754: TagPos firstTagPos = _openTags.findFirstTagPos();
755: if (firstTagPos != null) {
756: closeSnippet(nodeList, firstTagPos, null);
757: }
758: }
759:
760: // setters and getters
761:
762: public boolean isOmitUnknownTags() {
763: return omitUnknownTags;
764: }
765:
766: public void setOmitUnknownTags(boolean omitUnknownTags) {
767: this .omitUnknownTags = omitUnknownTags;
768: }
769:
770: public boolean isOmitDeprecatedTags() {
771: return omitDeprecatedTags;
772: }
773:
774: public void setOmitDeprecatedTags(boolean omitDeprecatedTags) {
775: this .omitDeprecatedTags = omitDeprecatedTags;
776: }
777:
778: public boolean isAdvancedXmlEscape() {
779: return advancedXmlEscape;
780: }
781:
782: public void setAdvancedXmlEscape(boolean advancedXmlEscape) {
783: this .advancedXmlEscape = advancedXmlEscape;
784: }
785:
786: public boolean isUseCdataForScriptAndStyle() {
787: return useCdataForScriptAndStyle;
788: }
789:
790: public void setUseCdataForScriptAndStyle(
791: boolean useCdataForScriptAndStyle) {
792: this .useCdataForScriptAndStyle = useCdataForScriptAndStyle;
793: }
794:
795: public boolean isTranslateSpecialEntities() {
796: return translateSpecialEntities;
797: }
798:
799: public void setTranslateSpecialEntities(
800: boolean translateSpecialEntities) {
801: this .translateSpecialEntities = translateSpecialEntities;
802: }
803:
804: public boolean isRecognizeUnicodeChars() {
805: return recognizeUnicodeChars;
806: }
807:
808: public void setRecognizeUnicodeChars(boolean recognizeUnicodeChars) {
809: this .recognizeUnicodeChars = recognizeUnicodeChars;
810: }
811:
812: public boolean isOmitComments() {
813: return omitComments;
814: }
815:
816: public void setOmitComments(boolean omitComments) {
817: this .omitComments = omitComments;
818: }
819:
820: public boolean isOmitXmlDeclaration() {
821: return omitXmlDeclaration;
822: }
823:
824: public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
825: this .omitXmlDeclaration = omitXmlDeclaration;
826: }
827:
828: public boolean isOmitDoctypeDeclaration() {
829: return omitDoctypeDeclaration;
830: }
831:
832: public void setOmitDoctypeDeclaration(boolean omitDoctypeDeclaration) {
833: this .omitDoctypeDeclaration = omitDoctypeDeclaration;
834: }
835:
836: public boolean isOmitXmlnsAttributes() {
837: return omitXmlnsAttributes;
838: }
839:
840: public void setOmitXmlnsAttributes(boolean omitXmlnsAttributes) {
841: this .omitXmlnsAttributes = omitXmlnsAttributes;
842: }
843:
844: public String getHyphenReplacementInComment() {
845: return hyphenReplacementInComment;
846: }
847:
848: public void setHyphenReplacementInComment(
849: String hyphenReplacementInComment) {
850: this .hyphenReplacementInComment = hyphenReplacementInComment;
851: }
852:
853: public Set getAllTags() {
854: return allTags;
855: }
856:
857: // methods for creating result
858:
859: /**
860: * Creates XML DOM document object.
861: * @return Instance of org.w3c.dom.Document
862: */
863: public Document createDOM() throws ParserConfigurationException {
864: DomSerializer domSerializer = new DomSerializer();
865: return domSerializer.createDOM(htmlNode);
866: }
867:
868: /**
869: * The most general way to serialize resulting XML.
870: * @param xmlSerializer
871: * @throws IOException
872: */
873: public void writeXml(XmlSerializer xmlSerializer)
874: throws IOException {
875: xmlSerializer.createXml(htmlNode);
876: }
877:
878: private void writeXml(Writer writer, int method) throws IOException {
879: XmlSerializer xmlSerializer = null;
880:
881: if (WRITE_METHOD_COMPACT == method) {
882: xmlSerializer = new CompactXmlSerializer(writer, this );
883: } else if (WRITE_METHOD_PRETTY == method) {
884: xmlSerializer = new PrettyXmlSerializer(writer, this );
885: } else {
886: xmlSerializer = new SimpleXmlSerializer(writer, this );
887: }
888:
889: xmlSerializer.createXml(htmlNode);
890: }
891:
892: private void writeToStream(OutputStream out, String charset,
893: int method) throws IOException {
894: BufferedWriter writer = new BufferedWriter(
895: new OutputStreamWriter(out, charset));
896: writeXml(writer, method);
897: }
898:
899: private void writeToStream(OutputStream out, int method)
900: throws IOException {
901: BufferedWriter writer = new BufferedWriter(
902: new OutputStreamWriter(out));
903: writeXml(writer, method);
904: }
905:
906: public void writeXmlToStream(OutputStream out) throws IOException {
907: writeToStream(out, WRITE_METHOD_SIMPLE);
908: }
909:
910: public void writeXmlToStream(OutputStream out, String charset)
911: throws IOException {
912: writeToStream(out, charset, WRITE_METHOD_SIMPLE);
913: }
914:
915: public void writeCompactXmlToStream(OutputStream out)
916: throws IOException {
917: writeToStream(out, WRITE_METHOD_COMPACT);
918: }
919:
920: public void writeCompactXmlToStream(OutputStream out, String charset)
921: throws IOException {
922: writeToStream(out, charset, WRITE_METHOD_COMPACT);
923: }
924:
925: public void writePrettyXmlToStream(OutputStream out)
926: throws IOException {
927: writeToStream(out, WRITE_METHOD_PRETTY);
928: }
929:
930: public void writePrettyXmlToStream(OutputStream out, String charset)
931: throws IOException {
932: writeToStream(out, charset, WRITE_METHOD_PRETTY);
933: }
934:
935: private void writeToFile(String fileName, String charset, int method)
936: throws IOException {
937: writeToStream(new FileOutputStream(fileName), charset, method);
938: }
939:
940: private void writeToFile(String fileName, int method)
941: throws IOException {
942: writeToStream(new FileOutputStream(fileName), method);
943: }
944:
945: public void writeXmlToFile(String fileName) throws IOException {
946: writeToFile(fileName, WRITE_METHOD_SIMPLE);
947: }
948:
949: public void writeXmlToFile(String fileName, String charset)
950: throws IOException {
951: writeToFile(fileName, charset, WRITE_METHOD_SIMPLE);
952: }
953:
954: public void writeCompactXmlToFile(String fileName)
955: throws IOException {
956: writeToFile(fileName, WRITE_METHOD_COMPACT);
957: }
958:
959: public void writeCompactXmlToFile(String fileName, String charset)
960: throws IOException {
961: writeToFile(fileName, charset, WRITE_METHOD_COMPACT);
962: }
963:
964: public void writePrettyXmlToFile(String fileName)
965: throws IOException {
966: writeToFile(fileName, WRITE_METHOD_PRETTY);
967: }
968:
969: public void writePrettyXmlToFile(String fileName, String charset)
970: throws IOException {
971: writeToFile(fileName, charset, WRITE_METHOD_PRETTY);
972: }
973:
974: public String getXmlAsString() throws IOException {
975: StringWriter writer = new StringWriter();
976: writeXml(writer, WRITE_METHOD_SIMPLE);
977:
978: return writer.getBuffer().toString();
979: }
980:
981: public String getCompactXmlAsString() throws IOException {
982: StringWriter writer = new StringWriter();
983: writeXml(writer, WRITE_METHOD_COMPACT);
984:
985: return writer.getBuffer().toString();
986: }
987:
988: public String getPrettyXmlAsString() throws IOException {
989: StringWriter writer = new StringWriter();
990: writeXml(writer, WRITE_METHOD_PRETTY);
991:
992: return writer.getBuffer().toString();
993: }
994:
995: }
|