Source Code Cross Referenced for HtmlCleaner.java in » Chat » claros-intouch » org » htmlcleaner » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Chat » claros intouch » org.htmlcleaner
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*  Copyright (c) 2006-2007, Vladimir Nikic
002:            All rights reserved.
003:        	
004:            Redistribution and use of this software in source and binary forms, 
005:            with or without modification, are permitted provided that the following 
006:            conditions are met:
007:        	
008:         * Redistributions of source code must retain the above
009:              copyright notice, this list of conditions and the
010:              following disclaimer.
011:        	
012:         * Redistributions in binary form must reproduce the above
013:              copyright notice, this list of conditions and the
014:              following disclaimer in the documentation and/or other
015:              materials provided with the distribution.
016:        	
017:         * The name of HtmlCleaner may not be used to endorse or promote 
018:              products derived from this software without specific prior
019:              written permission.
020:
021:            THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
022:            AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
023:            IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
024:            ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
025:            LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
026:            CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
027:            SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
028:            INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
029:            CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
030:            ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
031:            POSSIBILITY OF SUCH DAMAGE.
032:        	
033:            You can contact Vladimir Nikic by sending e-mail to
034:            nikic_vladimir@yahoo.com. Please include the word "HtmlCleaner" in the
035:            subject line.
036:         */
037:
038:        package org.htmlcleaner;
039:
040:        import org.w3c.dom.Document;
041:
042:        import javax.xml.parsers.ParserConfigurationException;
043:        import java.io.*;
044:        import java.net.URL;
045:        import java.util.*;
046:
047:        /**
048:         * Main HtmlCleaner class.
049:         *
050:         * <p>It represents the public interface to the user. Its task is to call the tokenizer with
051:         * the specified source HTML, traverse the list of produced tokens and create the internal
052:         * object model. It also offers a set of methods to write the resulting XML to a string,
053:         * file or any output stream.</p>
054:         * <p>Typical usage is the following:</p>
055:         *
056:         * <xmp>
057:         *      HtmlCleaner cleaner = new HtmlCleaner(...);     // one of few constructors
058:         *      cleaner.setXXX(...)                             // optionally, set cleaner's behaviour
059:         *      cleaner.clean(...);                             // calls cleaning process
060:         *      cleaner.writeXmlXXX(...);                       // writes resulting XML to string, file or any output stream
061:         *      // cleaner.createDOM();                         // or, alternatively, creates a DOM from the result
062:         * </xmp>
063:         *
064:         * Created by: Vladimir Nikic <br/>
065:         * Date: November, 2006
066:         */
067:        public class HtmlCleaner {
068:
069:            public static final String DEFAULT_CHARSET = System
070:                    .getProperty("file.encoding");
071:
072:            private static final int WRITE_METHOD_SIMPLE = 0;
073:            private static final int WRITE_METHOD_COMPACT = 1;
074:            private static final int WRITE_METHOD_PRETTY = 2;
075:
076:            /**
077:             * Contains information about single open tag
078:             */
079:            private class TagPos {
080:                private int position;
081:                private String name;
082:                private TagInfo info;
083:
084:                TagPos(int position, String name) {
085:                    this .position = position;
086:                    this .name = name;
087:                    this .info = tagInfoProvider.getTagInfo(name);
088:                }
089:            }
090:
091:            /**
092:             * Class that contains information and mathods for managing list of open,
093:             * but unhandled tags.
094:             */
095:            private class OpenTags {
096:                private List list = new ArrayList();
097:                private TagPos last = null;
098:                private Set set = new HashSet();
099:
100:                private boolean isEmpty() {
101:                    return list.isEmpty();
102:                }
103:
104:                private void addTag(String tagName, int position) {
105:                    last = new TagPos(position, tagName);
106:                    list.add(last);
107:                    set.add(tagName);
108:                }
109:
110:                private void removeTag(String tagName) {
111:                    ListIterator it = list.listIterator(list.size());
112:                    while (it.hasPrevious()) {
113:                        TagPos currTagPos = (TagPos) it.previous();
114:                        if (tagName.equals(currTagPos.name)) {
115:                            it.remove();
116:                            break;
117:                        }
118:                    }
119:
120:                    last = list.isEmpty() ? null : (TagPos) list.get(list
121:                            .size() - 1);
122:                }
123:
124:                private TagPos findFirstTagPos() {
125:                    return list.isEmpty() ? null : (TagPos) list.get(0);
126:                }
127:
128:                private TagPos getLastTagPos() {
129:                    return last;
130:                }
131:
132:                private TagPos findTag(String tagName) {
133:                    if (tagName != null) {
134:                        ListIterator it = list.listIterator(list.size());
135:                        while (it.hasPrevious()) {
136:                            TagPos currTagPos = (TagPos) it.previous();
137:                            if (tagName.equals(currTagPos.name)) {
138:                                return currTagPos;
139:                            }
140:                        }
141:                    }
142:
143:                    return null;
144:                }
145:
146:                private boolean tagExists(String tagName) {
147:                    TagPos tagPos = findTag(tagName);
148:                    return tagPos != null;
149:                }
150:
151:                private TagPos findTagToPlaceRubbish() {
152:                    TagPos result = null, prev = null;
153:
154:                    if (!isEmpty()) {
155:                        ListIterator it = list.listIterator(list.size());
156:                        while (it.hasPrevious()) {
157:                            result = (TagPos) it.previous();
158:                            if (result.info == null
159:                                    || result.info.allowsAnything()) {
160:                                if (prev != null) {
161:                                    return prev;
162:                                }
163:                            }
164:                            prev = result;
165:                        }
166:                    }
167:
168:                    return result;
169:                }
170:
171:                private boolean tagEncountered(String tagName) {
172:                    return set.contains(tagName);
173:                }
174:
175:                /**
176:                 * Checks if any of tags specified in the set are already open.
177:                 * @param tags
178:                 */
179:                private boolean someAlreadyOpen(Set tags) {
180:                    Iterator it = list.iterator();
181:                    while (it.hasNext()) {
182:                        TagPos curr = (TagPos) it.next();
183:                        if (tags.contains(curr.name)) {
184:                            return true;
185:                        }
186:                    }
187:
188:                    return false;
189:                }
190:            }
191:
192:            private ITagInfoProvider tagInfoProvider;
193:
194:            private Reader reader;
195:            private transient OpenTags _openTags = new OpenTags();
196:            private transient DoctypeToken _docType = null;
197:            private Set allTags = new TreeSet();
198:
199:            private boolean advancedXmlEscape = true;
200:            private boolean useCdataForScriptAndStyle = true;
201:            private boolean translateSpecialEntities = true;
202:            private boolean recognizeUnicodeChars = true;
203:            private boolean omitUnknownTags = false;
204:            private boolean omitDeprecatedTags = false;
205:            private boolean omitComments = false;
206:            private boolean omitXmlDeclaration = false;
207:            private boolean omitDoctypeDeclaration = true;
208:            private boolean omitXmlnsAttributes = false;
209:            private String hyphenReplacementInComment = "=";
210:
211:            private TagNode htmlNode;
212:            private TagNode bodyNode;
213:            private TagNode headNode;
214:            private TagNode styleNode;
215:
/**
 * Creates a cleaner that reads its HTML from the given string.
 *
 * @param htmlContent HTML source to clean
 * @param tagInfoProvider provider of tag information; when it is null the
 *        instance returned by HtmlTagProvider.getInstance() is used
 */
public HtmlCleaner(String htmlContent, ITagInfoProvider tagInfoProvider) {
    this.reader = new StringReader(htmlContent);
    if (tagInfoProvider == null) {
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    } else {
        this.tagInfoProvider = tagInfoProvider;
    }
}
228:
/**
 * Creates a cleaner that reads its HTML from the given string and uses
 * the tag info provider returned by HtmlTagProvider.getInstance().
 *
 * @param htmlContent HTML source to clean
 */
public HtmlCleaner(String htmlContent) {
    this(htmlContent, HtmlTagProvider.getInstance());
}
237:
238:            /**
239:             * Constructor - creates the instance for specified file.
240:             * @param file
241:             * @param charset
242:             * @throws IOException
243:             */
244:            public HtmlCleaner(File file, String charset,
245:                    ITagInfoProvider tagInfoProvider) throws IOException {
246:                FileInputStream in = new FileInputStream(file);
247:                this .reader = new InputStreamReader(in, charset);
248:                this .tagInfoProvider = tagInfoProvider == null ? HtmlTagProvider
249:                        .getInstance()
250:                        : tagInfoProvider;
251:            }
252:
/**
 * Creates a cleaner that reads its HTML from the given file and uses the
 * tag info provider returned by HtmlTagProvider.getInstance().
 *
 * @param file file containing the HTML source
 * @param charset name of the charset used to decode the file
 * @throws IOException propagated from the three-argument file constructor
 */
public HtmlCleaner(File file, String charset) throws IOException {
    this(file, charset, HtmlTagProvider.getInstance());
}
262:
/**
 * Creates a cleaner that reads its HTML from the given file, decoded with
 * {@link #DEFAULT_CHARSET}.
 *
 * @param file file containing the HTML source
 * @param tagInfoProvider provider of tag information, passed on unchanged
 * @throws IOException propagated from the three-argument file constructor
 */
public HtmlCleaner(File file, ITagInfoProvider tagInfoProvider)
        throws IOException {
    this(file, DEFAULT_CHARSET, tagInfoProvider);
}
272:
/**
 * Creates a cleaner that reads its HTML from the given file, decoded with
 * {@link #DEFAULT_CHARSET}, and uses the tag info provider returned by
 * HtmlTagProvider.getInstance().
 *
 * @param file file containing the HTML source
 * @throws IOException propagated from the three-argument file constructor
 */
public HtmlCleaner(File file) throws IOException {
    this(file, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
}
281:
/**
 * Creates a cleaner for the content of the given URL. The content is
 * obtained up front through Utils.readUrl() and then read from memory.
 *
 * @param url location of the HTML source
 * @param charset charset name handed to Utils.readUrl()
 * @param tagInfoProvider provider of tag information; when it is null the
 *        instance returned by HtmlTagProvider.getInstance() is used
 * @throws IOException propagated from Utils.readUrl()
 */
public HtmlCleaner(URL url, String charset,
        ITagInfoProvider tagInfoProvider) throws IOException {
    this.reader = new StringReader(Utils.readUrl(url, charset).toString());
    if (tagInfoProvider == null) {
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    } else {
        this.tagInfoProvider = tagInfoProvider;
    }
}
296:
/**
 * Creates a cleaner for the content of the given URL, read with
 * {@link #DEFAULT_CHARSET}.
 *
 * @param url location of the HTML source
 * @param tagInfoProvider provider of tag information, passed on unchanged
 * @throws IOException propagated from the three-argument URL constructor
 */
public HtmlCleaner(URL url, ITagInfoProvider tagInfoProvider)
        throws IOException {
    this(url, DEFAULT_CHARSET, tagInfoProvider);
}
307:
/**
 * Creates a cleaner for the content of the given URL, read with the given
 * charset, using the tag info provider returned by
 * HtmlTagProvider.getInstance().
 *
 * @param url location of the HTML source
 * @param charset charset name used to read the content
 * @throws IOException propagated from the three-argument URL constructor
 */
public HtmlCleaner(URL url, String charset) throws IOException {
    this(url, charset, HtmlTagProvider.getInstance());
}
317:
/**
 * Creates a cleaner for the content of the given URL, read with
 * {@link #DEFAULT_CHARSET}, using the tag info provider returned by
 * HtmlTagProvider.getInstance().
 *
 * @param url location of the HTML source
 * @throws IOException propagated from the three-argument URL constructor
 */
public HtmlCleaner(URL url) throws IOException {
    this(url, DEFAULT_CHARSET, HtmlTagProvider.getInstance());
}
326:
/**
 * Creates a cleaner that reads its HTML from the given input stream. No
 * charset is passed to the InputStreamReader wrapped around the stream.
 *
 * @param in stream supplying the HTML source
 * @param tagInfoProvider provider of tag information; when it is null the
 *        instance returned by HtmlTagProvider.getInstance() is used
 */
public HtmlCleaner(InputStream in, ITagInfoProvider tagInfoProvider) {
    this.reader = new InputStreamReader(in);
    if (tagInfoProvider == null) {
        this.tagInfoProvider = HtmlTagProvider.getInstance();
    } else {
        this.tagInfoProvider = tagInfoProvider;
    }
}
338:
/**
 * Creates a cleaner that reads its HTML from the given input stream and
 * uses the tag info provider returned by HtmlTagProvider.getInstance().
 *
 * @param in stream supplying the HTML source
 */
public HtmlCleaner(InputStream in) {
    this(in, HtmlTagProvider.getInstance());
}
346:
347:            DoctypeToken getDoctype() {
348:                return _docType;
349:            }
350:
351:            void setDoctype(DoctypeToken type) {
352:                _docType = type;
353:            }
354:
/**
 * Creates a cleaner that reads its HTML from the given input stream,
 * decoded with the given charset, and uses the tag info provider returned
 * by HtmlTagProvider.getInstance().
 *
 * @param in stream supplying the HTML source
 * @param charset name of the charset used to decode the stream
 * @throws IOException if the charset is not supported
 */
public HtmlCleaner(InputStream in, String charset)
        throws IOException {
    this.reader = new InputStreamReader(in, charset);
    // Every other constructor guarantees a non-null provider. Without this
    // assignment the first tagInfoProvider.getTagInfo(...) call made while
    // cleaning would fail with a NullPointerException.
    this.tagInfoProvider = HtmlTagProvider.getInstance();
}
366:
367:            public void clean(boolean isTextPlain, boolean addStyleSheet)
368:                    throws IOException {
369:                allTags.clear();
370:
371:                htmlNode = new TagNode("html");
372:                bodyNode = new TagNode("body");
373:                headNode = new TagNode("head");
374:                if (addStyleSheet) {
375:                    if (isTextPlain) {
376:                        styleNode = new TagNode("link");
377:                        styleNode.addAttribute("href", "../css/preview.css");
378:                        styleNode.addAttribute("rel", "stylesheet");
379:                        styleNode.addAttribute("type", "text/css");
380:                        headNode.addChild(styleNode);
381:                    }
382:                }
383:                htmlNode.addChild(headNode);
384:                htmlNode.addChild(bodyNode);
385:
386:                HtmlTokenizer htmlTokenizer = new HtmlTokenizer(this );
387:
388:                htmlTokenizer.start(isTextPlain);
389:
390:                List nodeList = htmlTokenizer.getTokenList();
391:                closeAll(nodeList);
392:                createDocumentNodes(nodeList);
393:            }
394:
395:            Reader getReader() {
396:                return reader;
397:            }
398:
399:            /**
400:             * Add attributes from specified map to the specified tag.
401:             * If some attribute already exist it is preserved.
402:             * @param tag
403:             * @param attributes
404:             */
405:            private void addAttributesToTag(TagNode tag, Map attributes) {
406:                if (attributes != null) {
407:                    Map tagAttributes = tag.getAttributes();
408:                    Iterator it = attributes.entrySet().iterator();
409:                    while (it.hasNext()) {
410:                        Map.Entry currEntry = (Map.Entry) it.next();
411:                        String attName = (String) currEntry.getKey();
412:                        if (!tagAttributes.containsKey(attName)) {
413:                            String attValue = (String) currEntry.getValue();
414:                            tag.addAttribute(attName, attValue);
415:                        }
416:                    }
417:                }
418:            }
419:
420:            /**
421:             * Checks if open fatal tag is missing if there is a fatal tag for
422:             * the specified tag.
423:             * @param tag
424:             */
425:            private boolean isFatalTagSatisfied(TagInfo tag) {
426:                if (tag != null) {
427:                    String fatalTagName = tag.getFatalTag();
428:                    return fatalTagName == null ? true : _openTags
429:                            .tagExists(fatalTagName);
430:                }
431:
432:                return true;
433:            }
434:
435:            /**
436:             * Check if specified tag requires parent tag, but that parent
437:             * tag is missing in the appropriate context.
438:             * @param tag
439:             */
440:            private boolean mustAddRequiredParent(TagInfo tag) {
441:                if (tag != null) {
442:                    String requiredParent = tag.getRequiredParent();
443:                    if (requiredParent != null) {
444:                        String fatalTag = tag.getFatalTag();
445:                        int fatalTagPositon = -1;
446:                        if (fatalTag != null) {
447:                            TagPos tagPos = _openTags.findTag(fatalTag);
448:                            if (tagPos != null) {
449:                                fatalTagPositon = tagPos.position;
450:                            }
451:                        }
452:
453:                        // iterates through the list of open tags from the end and check if there is some higher
454:                        ListIterator it = _openTags.list
455:                                .listIterator(_openTags.list.size());
456:                        while (it.hasPrevious()) {
457:                            TagPos currTagPos = (TagPos) it.previous();
458:                            if (tag.isHigher(currTagPos.name)) {
459:                                return currTagPos.position <= fatalTagPositon;
460:                            }
461:                        }
462:
463:                        return true;
464:                    }
465:                }
466:
467:                return false;
468:            }
469:
470:            private TagNode createTagNode(TagNode startTagToken) {
471:                startTagToken.setFormed();
472:                return startTagToken;
473:            }
474:
475:            private boolean isAllowedInLastOpenTag(BaseToken token) {
476:                TagPos last = _openTags.getLastTagPos();
477:                if (last != null) {
478:                    if (last.info != null) {
479:                        return last.info.allowsItem(token);
480:                    }
481:                }
482:
483:                return true;
484:            }
485:
486:            private void saveToLastOpenTag(List nodeList, Object tokenToAdd) {
487:                TagPos last = _openTags.getLastTagPos();
488:                if (last != null && last.info != null
489:                        && last.info.isIgnorePermitted()) {
490:                    return;
491:                }
492:
493:                TagPos rubbishPos = _openTags.findTagToPlaceRubbish();
494:                if (rubbishPos != null) {
495:                    TagNode startTagToken = (TagNode) nodeList
496:                            .get(rubbishPos.position);
497:                    startTagToken.addItemForMoving(tokenToAdd);
498:                }
499:            }
500:
501:            private boolean isStartToken(Object o) {
502:                return (o instanceof  TagNode) && !((TagNode) o).isFormed();
503:            }
504:
505:            void makeTree(List nodeList, ListIterator nodeIterator) {
506:                // process while not reach the end of the list
507:                while (nodeIterator.hasNext()) {
508:                    BaseToken token = (BaseToken) nodeIterator.next();
509:
510:                    if (token instanceof  EndTagToken) {
511:                        EndTagToken endTagToken = (EndTagToken) token;
512:                        String tagName = endTagToken.getName();
513:                        TagInfo tag = tagInfoProvider.getTagInfo(tagName);
514:
515:                        if ((tag == null && omitUnknownTags)
516:                                || (tag != null && tag.isDeprecated() && omitDeprecatedTags)) {
517:                            nodeIterator.set(null);
518:                        } else if (tag != null && !tag.allowsBody()) {
519:                            nodeIterator.set(null);
520:                        } else {
521:                            TagPos matchingPosition = _openTags
522:                                    .findTag(tagName);
523:
524:                            if (matchingPosition != null) {
525:                                closeSnippet(nodeList, matchingPosition,
526:                                        endTagToken);
527:                            } else if (!isAllowedInLastOpenTag(token)) {
528:                                saveToLastOpenTag(nodeList, token);
529:                            }
530:
531:                            nodeIterator.set(null);
532:                        }
533:                    } else if (isStartToken(token)) {
534:                        TagNode startTagToken = (TagNode) token;
535:                        String tagName = startTagToken.getName();
536:                        TagInfo tag = tagInfoProvider.getTagInfo(tagName);
537:
538:                        // add tag to set of all tags
539:                        allTags.add(tagName);
540:
541:                        // HTML open tag
542:                        if ("html".equals(tagName)) {
543:                            addAttributesToTag(htmlNode, startTagToken
544:                                    .getAttributes());
545:                            nodeIterator.set(null);
546:                            // BODY open tag
547:                        } else if ("body".equals(tagName)) {
548:                            addAttributesToTag(bodyNode, startTagToken
549:                                    .getAttributes());
550:                            nodeIterator.set(null);
551:                            // HEAD open tag
552:                        } else if ("head".equals(tagName)) {
553:                            addAttributesToTag(headNode, startTagToken
554:                                    .getAttributes());
555:                            nodeIterator.set(null);
556:                            // unknows HTML tag and unknown tags are not allowed
557:                        } else if ((tag == null && omitUnknownTags)
558:                                || (tag != null && tag.isDeprecated() && omitDeprecatedTags)) {
559:                            nodeIterator.set(null);
560:                        } else if (tag != null
561:                                && tag.hasPermittedTags()
562:                                && _openTags.someAlreadyOpen(tag
563:                                        .getPermittedTags())) {
564:                            nodeIterator.set(null);
565:                            // if tag that must be unique, ignore this occurence
566:                        } else if (tag != null && tag.isUnique()
567:                                && _openTags.tagEncountered(tagName)) {
568:                            nodeIterator.set(null);
569:                            // if there is no required outer tag without that this open tag is ignored
570:                        } else if (!isFatalTagSatisfied(tag)) {
571:                            nodeIterator.set(null);
572:                            // if there is no required parent tag - it must be added before this open tag
573:                        } else if (mustAddRequiredParent(tag)) {
574:                            String requiredParent = tag.getRequiredParent();
575:                            TagNode requiredParentStartToken = new TagNode(
576:                                    requiredParent);
577:                            nodeIterator.previous();
578:                            nodeIterator.add(requiredParentStartToken);
579:                            nodeIterator.previous();
580:                            // if last open tag has lower presidence then this, it must be closed
581:                        } else if (tag != null
582:                                && !_openTags.isEmpty()
583:                                && tag
584:                                        .isMustCloseTag(tagInfoProvider
585:                                                .getTagInfo(_openTags
586:                                                        .getLastTagPos().name))) {
587:                            List closed = closeSnippet(nodeList, _openTags
588:                                    .getLastTagPos(), startTagToken);
589:                            int closedCount = closed.size();
590:
591:                            // it is needed to copy some tags again in front of current, if there are any
592:                            if (tag.hasCopyTags() && closedCount > 0) {
593:                                // first iterates over list from the back and collects all start tokens
594:                                // in sequence that must be copied
595:                                ListIterator closedIt = closed
596:                                        .listIterator(closedCount);
597:                                List toBeCopied = new ArrayList();
598:                                while (closedIt.hasPrevious()) {
599:                                    TagNode currStartToken = (TagNode) closedIt
600:                                            .previous();
601:                                    if (tag.isCopy(currStartToken.getName())) {
602:                                        toBeCopied.add(0, currStartToken);
603:                                    } else {
604:                                        break;
605:                                    }
606:                                }
607:
608:                                if (toBeCopied.size() > 0) {
609:                                    Iterator copyIt = toBeCopied.iterator();
610:                                    while (copyIt.hasNext()) {
611:                                        TagNode currStartToken = (TagNode) copyIt
612:                                                .next();
613:                                        nodeIterator.add(currStartToken
614:                                                .makeCopy());
615:                                    }
616:
617:                                    // back to the previous place, before adding new start tokens
618:                                    for (int i = 0; i < toBeCopied.size(); i++) {
619:                                        nodeIterator.previous();
620:                                    }
621:                                }
622:                            }
623:
624:                            nodeIterator.previous();
625:                            // if this open tag is not allowed inside last open tag, then it must be moved to the place where it can be
626:                        } else if (!isAllowedInLastOpenTag(token)) {
627:                            saveToLastOpenTag(nodeList, token);
628:                            nodeIterator.set(null);
629:                            // if it is known HTML tag but doesn't allow body, it is immidiately closed
630:                        } else if (tag != null && !tag.allowsBody()) {
631:                            TagNode newTagNode = createTagNode(startTagToken);
632:                            if (tag.isHeadTag()) {
633:                                headNode.addChild(newTagNode);
634:                                nodeIterator.set(null);
635:                            } else {
636:                                nodeIterator.set(newTagNode);
637:                            }
638:                            // default case - just remember this open tag and go further
639:                        } else {
640:                            _openTags.addTag(tagName, nodeIterator
641:                                    .previousIndex());
642:                        }
643:                    } else {
644:                        if (!isAllowedInLastOpenTag(token)) {
645:                            saveToLastOpenTag(nodeList, token);
646:                            nodeIterator.set(null);
647:                        }
648:                    }
649:                }
650:            }
651:
652:            private void createDocumentNodes(List listNodes) {
653:                Iterator it = listNodes.iterator();
654:                while (it.hasNext()) {
655:                    Object child = it.next();
656:
657:                    if (child == null) {
658:                        continue;
659:                    }
660:
661:                    TagNode parent = bodyNode;
662:                    boolean toAdd = true;
663:
664:                    if (child instanceof  TagNode) {
665:                        TagInfo tag = tagInfoProvider
666:                                .getTagInfo(((TagNode) child).getName());
667:                        if (tag != null) {
668:                            if (tag.isHeadTag()
669:                                    || (tag.isHeadAndBodyTag() && bodyNode
670:                                            .getChildren().isEmpty())) {
671:                                parent = headNode;
672:                            }
673:                        }
674:                    } else {
675:                        if (child instanceof  ContentToken) {
676:                            toAdd = !"".equals(((ContentToken) child)
677:                                    .toString());
678:                        }
679:                    }
680:
681:                    if (toAdd) {
682:                        parent.addChild(child);
683:                    }
684:                }
685:            }
686:
687:            private List closeSnippet(List nodeList, TagPos tagPos,
688:                    Object toNode) {
689:                List closed = new ArrayList();
690:                ListIterator it = nodeList.listIterator(tagPos.position);
691:
692:                TagNode tagNode = null;
693:                Object item = it.next();
694:                boolean isListEnd = false;
695:
696:                while ((toNode == null && !isListEnd)
697:                        || (toNode != null && item != toNode)) {
698:                    if (isStartToken(item)) {
699:                        TagNode startTagToken = (TagNode) item;
700:                        closed.add(startTagToken);
701:                        List itemsToMove = startTagToken.getItemsToMove();
702:                        if (itemsToMove != null) {
703:                            OpenTags prevOpenTags = _openTags;
704:                            _openTags = new OpenTags();
705:                            makeTree(itemsToMove, itemsToMove.listIterator(0));
706:                            closeAll(itemsToMove);
707:                            startTagToken.setItemsToMove(null);
708:                            _openTags = prevOpenTags;
709:                        }
710:
711:                        TagNode newTagNode = createTagNode(startTagToken);
712:
713:                        TagInfo tag = tagInfoProvider.getTagInfo(newTagNode
714:                                .getName());
715:                        if (tag != null && tag.isHeadTag()) {
716:                            headNode.addChild(newTagNode);
717:                            it.set(null);
718:                        } else if (tagNode != null) {
719:                            tagNode.addChildren(itemsToMove);
720:                            tagNode.addChild(newTagNode);
721:                            it.set(null);
722:                        } else {
723:                            if (itemsToMove != null) {
724:                                itemsToMove.add(newTagNode);
725:                                it.set(itemsToMove);
726:                            } else {
727:                                it.set(newTagNode);
728:                            }
729:                        }
730:
731:                        _openTags.removeTag(newTagNode.getName());
732:                        tagNode = newTagNode;
733:                    } else {
734:                        if (tagNode != null) {
735:                            it.set(null);
736:                            if (item != null) {
737:                                tagNode.addChild(item);
738:                            }
739:                        }
740:                    }
741:
742:                    if (it.hasNext()) {
743:                        item = it.next();
744:                    } else {
745:                        isListEnd = true;
746:                    }
747:                }
748:
749:                return closed;
750:            }
751:
752:            /**
753:             * Close all unclosed tags if there are any.
754:             */
755:            private void closeAll(List nodeList) {
756:                TagPos firstTagPos = _openTags.findFirstTagPos();
757:                if (firstTagPos != null) {
758:                    closeSnippet(nodeList, firstTagPos, null);
759:                }
760:            }
761:
762:            // setters and getters
763:
764:            public boolean isOmitUnknownTags() {
765:                return omitUnknownTags;
766:            }
767:
768:            public void setOmitUnknownTags(boolean omitUnknownTags) {
769:                this .omitUnknownTags = omitUnknownTags;
770:            }
771:
772:            public boolean isOmitDeprecatedTags() {
773:                return omitDeprecatedTags;
774:            }
775:
776:            public void setOmitDeprecatedTags(boolean omitDeprecatedTags) {
777:                this .omitDeprecatedTags = omitDeprecatedTags;
778:            }
779:
780:            public boolean isAdvancedXmlEscape() {
781:                return advancedXmlEscape;
782:            }
783:
784:            public void setAdvancedXmlEscape(boolean advancedXmlEscape) {
785:                this .advancedXmlEscape = advancedXmlEscape;
786:            }
787:
788:            public boolean isUseCdataForScriptAndStyle() {
789:                return useCdataForScriptAndStyle;
790:            }
791:
792:            public void setUseCdataForScriptAndStyle(
793:                    boolean useCdataForScriptAndStyle) {
794:                this .useCdataForScriptAndStyle = useCdataForScriptAndStyle;
795:            }
796:
797:            public boolean isTranslateSpecialEntities() {
798:                return translateSpecialEntities;
799:            }
800:
801:            public void setTranslateSpecialEntities(
802:                    boolean translateSpecialEntities) {
803:                this .translateSpecialEntities = translateSpecialEntities;
804:            }
805:
806:            public boolean isRecognizeUnicodeChars() {
807:                return recognizeUnicodeChars;
808:            }
809:
810:            public void setRecognizeUnicodeChars(boolean recognizeUnicodeChars) {
811:                this .recognizeUnicodeChars = recognizeUnicodeChars;
812:            }
813:
814:            public boolean isOmitComments() {
815:                return omitComments;
816:            }
817:
818:            public void setOmitComments(boolean omitComments) {
819:                this .omitComments = omitComments;
820:            }
821:
822:            public boolean isOmitXmlDeclaration() {
823:                return omitXmlDeclaration;
824:            }
825:
826:            public void setOmitXmlDeclaration(boolean omitXmlDeclaration) {
827:                this .omitXmlDeclaration = omitXmlDeclaration;
828:            }
829:
830:            public boolean isOmitDoctypeDeclaration() {
831:                return omitDoctypeDeclaration;
832:            }
833:
834:            public void setOmitDoctypeDeclaration(boolean omitDoctypeDeclaration) {
835:                this .omitDoctypeDeclaration = omitDoctypeDeclaration;
836:            }
837:
838:            public boolean isOmitXmlnsAttributes() {
839:                return omitXmlnsAttributes;
840:            }
841:
842:            public void setOmitXmlnsAttributes(boolean omitXmlnsAttributes) {
843:                this .omitXmlnsAttributes = omitXmlnsAttributes;
844:            }
845:
846:            public String getHyphenReplacementInComment() {
847:                return hyphenReplacementInComment;
848:            }
849:
850:            public void setHyphenReplacementInComment(
851:                    String hyphenReplacementInComment) {
852:                this .hyphenReplacementInComment = hyphenReplacementInComment;
853:            }
854:
855:            public Set getAllTags() {
856:                return allTags;
857:            }
858:
859:            // methods for creating result
860:
861:            /**
862:             * Creates XML DOM document object.
863:             * @return Instance of org.w3c.dom.Document
864:             */
865:            public Document createDOM() throws ParserConfigurationException {
866:                DomSerializer domSerializer = new DomSerializer();
867:                return domSerializer.createDOM(htmlNode);
868:            }
869:
870:            /**
871:             * The most general way to serialize resulting XML.
872:             * @param xmlSerializer
873:             * @throws IOException
874:             */
875:            public void writeXml(XmlSerializer xmlSerializer)
876:                    throws IOException {
877:                xmlSerializer.createXml(htmlNode);
878:            }
879:
880:            private void writeXml(Writer writer, int method) throws IOException {
881:                XmlSerializer xmlSerializer = null;
882:
883:                if (WRITE_METHOD_COMPACT == method) {
884:                    xmlSerializer = new CompactXmlSerializer(writer, this );
885:                } else if (WRITE_METHOD_PRETTY == method) {
886:                    xmlSerializer = new PrettyXmlSerializer(writer, this );
887:                } else {
888:                    xmlSerializer = new SimpleXmlSerializer(writer, this );
889:                }
890:
891:                xmlSerializer.createXml(htmlNode);
892:            }
893:
894:            private void writeToStream(OutputStream out, String charset,
895:                    int method) throws IOException {
896:                BufferedWriter writer = new BufferedWriter(
897:                        new OutputStreamWriter(out, charset));
898:                writeXml(writer, method);
899:            }
900:
901:            private void writeToStream(OutputStream out, int method)
902:                    throws IOException {
903:                BufferedWriter writer = new BufferedWriter(
904:                        new OutputStreamWriter(out));
905:                writeXml(writer, method);
906:            }
907:
908:            public void writeXmlToStream(OutputStream out) throws IOException {
909:                writeToStream(out, WRITE_METHOD_SIMPLE);
910:            }
911:
912:            public void writeXmlToStream(OutputStream out, String charset)
913:                    throws IOException {
914:                writeToStream(out, charset, WRITE_METHOD_SIMPLE);
915:            }
916:
917:            public void writeCompactXmlToStream(OutputStream out)
918:                    throws IOException {
919:                writeToStream(out, WRITE_METHOD_COMPACT);
920:            }
921:
922:            public void writeCompactXmlToStream(OutputStream out, String charset)
923:                    throws IOException {
924:                writeToStream(out, charset, WRITE_METHOD_COMPACT);
925:            }
926:
927:            public void writePrettyXmlToStream(OutputStream out)
928:                    throws IOException {
929:                writeToStream(out, WRITE_METHOD_PRETTY);
930:            }
931:
932:            public void writePrettyXmlToStream(OutputStream out, String charset)
933:                    throws IOException {
934:                writeToStream(out, charset, WRITE_METHOD_PRETTY);
935:            }
936:
937:            private void writeToFile(String fileName, String charset, int method)
938:                    throws IOException {
939:                writeToStream(new FileOutputStream(fileName), charset, method);
940:            }
941:
942:            private void writeToFile(String fileName, int method)
943:                    throws IOException {
944:                writeToStream(new FileOutputStream(fileName), method);
945:            }
946:
947:            public void writeXmlToFile(String fileName) throws IOException {
948:                writeToFile(fileName, WRITE_METHOD_SIMPLE);
949:            }
950:
951:            public void writeXmlToFile(String fileName, String charset)
952:                    throws IOException {
953:                writeToFile(fileName, charset, WRITE_METHOD_SIMPLE);
954:            }
955:
956:            public void writeCompactXmlToFile(String fileName)
957:                    throws IOException {
958:                writeToFile(fileName, WRITE_METHOD_COMPACT);
959:            }
960:
961:            public void writeCompactXmlToFile(String fileName, String charset)
962:                    throws IOException {
963:                writeToFile(fileName, charset, WRITE_METHOD_COMPACT);
964:            }
965:
966:            public void writePrettyXmlToFile(String fileName)
967:                    throws IOException {
968:                writeToFile(fileName, WRITE_METHOD_PRETTY);
969:            }
970:
971:            public void writePrettyXmlToFile(String fileName, String charset)
972:                    throws IOException {
973:                writeToFile(fileName, charset, WRITE_METHOD_PRETTY);
974:            }
975:
976:            public String getXmlAsString() throws IOException {
977:                StringWriter writer = new StringWriter();
978:                writeXml(writer, WRITE_METHOD_SIMPLE);
979:
980:                return writer.getBuffer().toString();
981:            }
982:
983:            public String getCompactXmlAsString() throws IOException {
984:                StringWriter writer = new StringWriter();
985:                writeXml(writer, WRITE_METHOD_COMPACT);
986:
987:                return writer.getBuffer().toString();
988:            }
989:
990:            public String getPrettyXmlAsString() throws IOException {
991:                StringWriter writer = new StringWriter();
992:                writeXml(writer, WRITE_METHOD_PRETTY);
993:
994:                return writer.getBuffer().toString();
995:            }
996:
997:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.