Source Code Cross Referenced for Purifier.java in » HTML-Parser » nekohtml » org » cyberneko » html » filters » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » HTML Parser » nekohtml » org.cyberneko.html.filters

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        /* 
002:         * Copyright 2004-2008 Andy Clark
003:         * 
004:         * Licensed under the Apache License, Version 2.0 (the "License");
005:         * you may not use this file except in compliance with the License.
006:         * You may obtain a copy of the License at
007:         *
008:         *     http://www.apache.org/licenses/LICENSE-2.0
009:         *
010:         * Unless required by applicable law or agreed to in writing, software
011:         * distributed under the License is distributed on an "AS IS" BASIS,
012:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013:         * See the License for the specific language governing permissions and
014:         * limitations under the License.
015:         */
016:
017:        package org.cyberneko.html.filters;
018:
019:        import org.apache.xerces.util.XMLChar;
020:        import org.apache.xerces.util.XMLStringBuffer;
021:        import org.apache.xerces.xni.Augmentations;
022:        import org.apache.xerces.xni.NamespaceContext;
023:        import org.apache.xerces.xni.QName;
024:        import org.apache.xerces.xni.XMLAttributes;
025:        import org.apache.xerces.xni.XMLLocator;
026:        import org.apache.xerces.xni.XMLString;
027:        import org.apache.xerces.xni.XNIException;
028:        import org.apache.xerces.xni.parser.XMLComponentManager;
029:        import org.apache.xerces.xni.parser.XMLConfigurationException;
030:        import org.cyberneko.html.HTMLAugmentations;
031:        import org.cyberneko.html.HTMLEventInfo;
032:
033:        /**
034:         * This filter purifies the HTML input to ensure XML well-formedness.
035:         * The purification process includes:
036:         * <ul>
037:         * <li>fixing illegal characters in the document, including
038:         *  <ul>
039:         *  <li>element and attribute names,
040:         *  <li>processing instruction target and data,
041:         *  <li>document text;
042:         *  </ul>
043:         * <li>ensuring the string "--" does not appear in the content of
044:         *     a comment;
045:         * <li>ensuring the string "]]>" does not appear in the content of
046:         *     a CDATA section; 
047:         * <li>ensuring that the XML declaration has required pseudo-attributes
048:         *     and that the values are correct;
049:         * and
050:         * <li>synthesized missing namespace bindings.
051:         * </ul>
052:         * <p>
053:         * Illegal characters in XML names are converted to the character 
054:         * sequence "_u####_" where "####" is the value of the Unicode 
055:         * character represented in hexadecimal. Whereas illegal characters
056:         * appearing in document content is converted to the character
057:         * sequence "\\u####".
058: * <p>
059: * In comments, the character '-' is replaced by the character
060: * sequence "- " to prevent "--" from ever appearing in the comment
061: * content. For CDATA sections, the character ']' is replaced by
062: * the character sequence "] " to prevent "]]" from appearing.
063: * <p>
064: * The URI used for synthesized namespace bindings is
065: * "http://cyberneko.org/html/ns/synthesized/<i>number</i>" where
066: * <i>number</i> is generated to ensure uniqueness.
067: * 
068: * @author Andy Clark
069: * 
070: * @version $Id: Purifier.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
071: */
072:        public class Purifier extends DefaultFilter {
073:
074:            //
075:            // Constants
076:            //
077:
078:            /** Synthesized namespace binding prefix. */
079:            public static final String SYNTHESIZED_NAMESPACE_PREFX = "http://cyberneko.org/html/ns/synthesized/";
080:
081:            /** Namespaces. */
082:            protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
083:
084:            /** Include infoset augmentations. */
085:            protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
086:
087:            /** Recognized features. */
088:            private static final String[] RECOGNIZED_FEATURES = { NAMESPACES,
089:                    AUGMENTATIONS, };
090:
091:            // static vars
092:
093:            /** Synthesized event info item. */
094:            protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem();
095:
096:            //
097:            // Data
098:            //
099:
100:            // features
101:
102:            /** Namespaces. */
103:            protected boolean fNamespaces;
104:
105:            /** Augmentations. */
106:            protected boolean fAugmentations;
107:
108:            // state
109:
110:            /** True if the doctype declaration was seen. */
111:            protected boolean fSeenDoctype;
112:
113:            /** True if root element was seen. */
114:            protected boolean fSeenRootElement;
115:
116:            /** True if inside a CDATA section. */
117:            protected boolean fInCDATASection;
118:
119:            // doctype declaration info
120:
121:            /** Public identifier of doctype declaration. */
122:            protected String fPublicId;
123:
124:            /** System identifier of doctype declaration. */
125:            protected String fSystemId;
126:
127:            // namespace info
128:
129:            /** Namespace information. */
130:            protected NamespaceContext fNamespaceContext;
131:
132:            /** Synthesized namespace binding count. */
133:            protected int fSynthesizedNamespaceCount;
134:
135:            // temp vars
136:
137:            /** Qualified name. */
138:            private QName fQName = new QName();
139:
140:            /** Augmentations. */
141:            private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
142:
143:            /** String buffer. */
144:            private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();
145:
146:            //
147:            // XMLComponent methods
148:            //
149:
150:            public void reset(XMLComponentManager manager)
151:                    throws XMLConfigurationException {
152:
153:                // state
154:                fInCDATASection = false;
155:
156:                // features
157:                fNamespaces = manager.getFeature(NAMESPACES);
158:                fAugmentations = manager.getFeature(AUGMENTATIONS);
159:
160:            } // reset(XMLComponentManager)
161:
162:            //
163:            // XMLDocumentHandler methods
164:            //
165:
166:            /** Start document. */
167:            public void startDocument(XMLLocator locator, String encoding,
168:                    Augmentations augs) throws XNIException {
169:                fNamespaceContext = fNamespaces ? new NamespaceBinder.NamespaceSupport()
170:                        : null;
171:                fSynthesizedNamespaceCount = 0;
172:                handleStartDocument();
173:                super .startDocument(locator, encoding, augs);
174:            } // startDocument(XMLLocator,String,Augmentations)
175:
176:            /** Start document. */
177:            public void startDocument(XMLLocator locator, String encoding,
178:                    NamespaceContext nscontext, Augmentations augs)
179:                    throws XNIException {
180:                fNamespaceContext = nscontext;
181:                fSynthesizedNamespaceCount = 0;
182:                handleStartDocument();
183:                super .startDocument(locator, encoding, nscontext, augs);
184:            } // startDocument(XMLLocator,NamespaceContext,String,Augmentations)
185:
186:            /** XML declaration. */
187:            public void xmlDecl(String version, String encoding,
188:                    String standalone, Augmentations augs) throws XNIException {
189:                if (version == null || !version.equals("1.0")) {
190:                    version = "1.0";
191:                }
192:                if (encoding != null && encoding.length() == 0) {
193:                    encoding = null;
194:                }
195:                if (standalone != null) {
196:                    if (!standalone.equalsIgnoreCase("true")
197:                            && !standalone.equalsIgnoreCase("false")) {
198:                        standalone = null;
199:                    } else {
200:                        standalone = standalone.toLowerCase();
201:                    }
202:                }
203:                super .xmlDecl(version, encoding, standalone, augs);
204:            } // xmlDecl(String,String,String,Augmentations)
205:
206:            /** Comment. */
207:            public void comment(XMLString text, Augmentations augs)
208:                    throws XNIException {
209:                StringBuffer str = new StringBuffer(purifyText(text).toString());
210:                int length = str.length();
211:                for (int i = length - 1; i >= 0; i--) {
212:                    char c = str.charAt(i);
213:                    if (c == '-') {
214:                        str.insert(i + 1, ' ');
215:                    }
216:                }
217:                fStringBuffer.length = 0;
218:                fStringBuffer.append(str.toString());
219:                text = fStringBuffer;
220:                super .comment(text, augs);
221:            } // comment(XMLString,Augmentations)
222:
223:            /** Processing instruction. */
224:            public void processingInstruction(String target, XMLString data,
225:                    Augmentations augs) throws XNIException {
226:                target = purifyName(target, true);
227:                data = purifyText(data);
228:                super .processingInstruction(target, data, augs);
229:            } // processingInstruction(String,XMLString,Augmentations)
230:
231:            /** Doctype declaration. */
232:            public void doctypeDecl(String root, String pubid, String sysid,
233:                    Augmentations augs) throws XNIException {
234:                fSeenDoctype = true;
235:                // NOTE: It doesn't matter what the root element name is because
236:                //       it must match the root element. -Ac
237:                fPublicId = pubid;
238:                fSystemId = sysid;
239:                // NOTE: If the public identifier is specified, then a system
240:                //       identifier must also be specified. -Ac
241:                if (fPublicId != null && fSystemId == null) {
242:                    fSystemId = "";
243:                }
244:                // NOTE: Can't save the augmentations because the object state
245:                //       is transient. -Ac
246:            } // doctypeDecl(String,String,String,Augmentations)
247:
248:            /** Start element. */
249:            public void startElement(QName element, XMLAttributes attrs,
250:                    Augmentations augs) throws XNIException {
251:                handleStartElement(element, attrs);
252:                super .startElement(element, attrs, augs);
253:            } // startElement(QName,XMLAttributes,Augmentations)
254:
255:            /** Empty element. */
256:            public void emptyElement(QName element, XMLAttributes attrs,
257:                    Augmentations augs) throws XNIException {
258:                handleStartElement(element, attrs);
259:                super .emptyElement(element, attrs, augs);
260:            } // emptyElement(QName,XMLAttributes,Augmentations)
261:
262:            /** Start CDATA section. */
263:            public void startCDATA(Augmentations augs) throws XNIException {
264:                fInCDATASection = true;
265:                super .startCDATA(augs);
266:            } // startCDATA(Augmentations)
267:
268:            /** End CDATA section. */
269:            public void endCDATA(Augmentations augs) throws XNIException {
270:                fInCDATASection = false;
271:                super .endCDATA(augs);
272:            } // endCDATA(Augmentations)
273:
274:            /** Characters. */
275:            public void characters(XMLString text, Augmentations augs)
276:                    throws XNIException {
277:                text = purifyText(text);
278:                if (fInCDATASection) {
279:                    StringBuffer str = new StringBuffer(text.toString());
280:                    int length = str.length();
281:                    for (int i = length - 1; i >= 0; i--) {
282:                        char c = str.charAt(i);
283:                        if (c == ']') {
284:                            str.insert(i + 1, ' ');
285:                        }
286:                    }
287:                    fStringBuffer.length = 0;
288:                    fStringBuffer.append(str.toString());
289:                    text = fStringBuffer;
290:                }
291:                super .characters(text, augs);
292:            } // characters(XMLString,Augmentations)
293:
294:            /** End element. */
295:            public void endElement(QName element, Augmentations augs)
296:                    throws XNIException {
297:                element = purifyQName(element);
298:                if (fNamespaces) {
299:                    if (element.prefix != null && element.uri == null) {
300:                        element.uri = fNamespaceContext.getURI(element.prefix);
301:                    }
302:                }
303:                super .endElement(element, augs);
304:            } // endElement(QName,Augmentations)
305:
306:            //
307:            // Protected methods
308:            //
309:
310:            /** Handle start document. */
311:            protected void handleStartDocument() {
312:                fSeenDoctype = false;
313:                fSeenRootElement = false;
314:            } // handleStartDocument()
315:
316:            /** Handle start element. */
317:            protected void handleStartElement(QName element, XMLAttributes attrs) {
318:
319:                // handle element and attributes
320:                element = purifyQName(element);
321:                int attrCount = attrs != null ? attrs.getLength() : 0;
322:                for (int i = attrCount - 1; i >= 0; i--) {
323:                    // purify attribute name
324:                    attrs.getName(i, fQName);
325:                    attrs.setName(i, purifyQName(fQName));
326:
327:                    // synthesize namespace bindings
328:                    if (fNamespaces) {
329:                        if (!fQName.rawname.equals("xmlns")
330:                                && !fQName.rawname.startsWith("xmlns:")) {
331:                            // NOTE: Must get attribute name again because the
332:                            //       purifyQName method does not guarantee that
333:                            //       the same QName object is returned. -Ac
334:                            attrs.getName(i, fQName);
335:                            if (fQName.prefix != null && fQName.uri == null) {
336:                                synthesizeBinding(attrs, fQName.prefix);
337:                            }
338:                        }
339:                    }
340:                }
341:
342:                // synthesize namespace bindings
343:                if (fNamespaces) {
344:                    if (element.prefix != null && element.uri == null) {
345:                        synthesizeBinding(attrs, element.prefix);
346:                    }
347:                }
348:
349:                // synthesize doctype declaration
350:                if (!fSeenRootElement && fSeenDoctype) {
351:                    Augmentations augs = synthesizedAugs();
352:                    super .doctypeDecl(element.rawname, fPublicId, fSystemId,
353:                            augs);
354:                }
355:
356:                // mark start element as seen
357:                fSeenRootElement = true;
358:
359:            } // handleStartElement(QName,XMLAttributes)
360:
361:            /** Synthesize namespace binding. */
362:            protected void synthesizeBinding(XMLAttributes attrs, String ns) {
363:                String prefix = "xmlns";
364:                String localpart = ns;
365:                String qname = prefix + ':' + localpart;
366:                String uri = NamespaceBinder.NAMESPACES_URI;
367:                String atype = "CDATA";
368:                String avalue = SYNTHESIZED_NAMESPACE_PREFX
369:                        + fSynthesizedNamespaceCount++;
370:
371:                // add attribute
372:                fQName.setValues(prefix, localpart, qname, uri);
373:                attrs.addAttribute(fQName, atype, avalue);
374:
375:                // bind namespace
376:                fNamespaceContext.declarePrefix(ns, avalue);
377:
378:            } // synthesizeBinding(XMLAttributes,String)
379:
380:            /** Returns an augmentations object with a synthesized item added. */
381:            protected final Augmentations synthesizedAugs() {
382:                HTMLAugmentations augs = null;
383:                if (fAugmentations) {
384:                    augs = fInfosetAugs;
385:                    augs.removeAllItems();
386:                    augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
387:                }
388:                return augs;
389:            } // synthesizedAugs():Augmentations
390:
391:            //
392:            // Protected methods
393:            //
394:
395:            /** Purify qualified name. */
396:            protected QName purifyQName(QName qname) {
397:                qname.prefix = purifyName(qname.prefix, true);
398:                qname.localpart = purifyName(qname.localpart, true);
399:                qname.rawname = purifyName(qname.rawname, false);
400:                return qname;
401:            } // purifyQName(QName):QName
402:
403:            /** Purify name. */
404:            protected String purifyName(String name, boolean localpart) {
405:                if (name == null) {
406:                    return name;
407:                }
408:                StringBuffer str = new StringBuffer();
409:                int length = name.length();
410:                boolean seenColon = localpart;
411:                for (int i = 0; i < length; i++) {
412:                    char c = name.charAt(i);
413:                    if (i == 0) {
414:                        if (!XMLChar.isNameStart(c)) {
415:                            str.append("_u" + toHexString(c, 4) + "_");
416:                        } else {
417:                            str.append(c);
418:                        }
419:                    } else {
420:                        if ((fNamespaces && c == ':' && seenColon)
421:                                || !XMLChar.isName(c)) {
422:                            str.append("_u" + toHexString(c, 4) + "_");
423:                        } else {
424:                            str.append(c);
425:                        }
426:                        seenColon = seenColon || c == ':';
427:                    }
428:                }
429:                return str.toString();
430:            } // purifyName(String):String
431:
432:            /** Purify content. */
433:            protected XMLString purifyText(XMLString text) {
434:                fStringBuffer.length = 0;
435:                for (int i = 0; i < text.length; i++) {
436:                    char c = text.ch[text.offset + i];
437:                    if (XMLChar.isInvalid(c)) {
438:                        fStringBuffer.append("\\u" + toHexString(c, 4));
439:                    } else {
440:                        fStringBuffer.append(c);
441:                    }
442:                }
443:                return fStringBuffer;
444:            } // purifyText(XMLString):XMLString
445:
446:            //
447:            // Protected static methods
448:            //
449:
450:            /** Returns a padded hexadecimal string for the given value. */
451:            protected static String toHexString(int c, int padlen) {
452:                StringBuffer str = new StringBuffer(padlen);
453:                str.append(Integer.toHexString(c));
454:                int len = padlen - str.length();
455:                for (int i = 0; i < len; i++) {
456:                    str.insert(0, '0');
457:                }
458:                return str.toString().toUpperCase();
459:            } // toHexString(int,int):String
460:
461:        } // class Purifier

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.