Source Code Cross Referenced for Writer.java in » HTML-Parser » nekohtml » org » cyberneko » html » filters » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » HTML Parser » nekohtml » org.cyberneko.html.filters
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /* 
002:         * Copyright 2002-2008 Andy Clark
003:         * 
004:         * Licensed under the Apache License, Version 2.0 (the "License");
005:         * you may not use this file except in compliance with the License.
006:         * You may obtain a copy of the License at
007:         *
008:         *     http://www.apache.org/licenses/LICENSE-2.0
009:         *
010:         * Unless required by applicable law or agreed to in writing, software
011:         * distributed under the License is distributed on an "AS IS" BASIS,
012:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013:         * See the License for the specific language governing permissions and
014:         * limitations under the License.
015:         */
016:
017:        package org.cyberneko.html.filters;
018:
019:        import java.io.OutputStream;
020:        import java.io.OutputStreamWriter;
021:        import java.io.PrintWriter;
022:        import java.io.UnsupportedEncodingException;
023:
024:        import org.cyberneko.html.HTMLConfiguration;
025:        import org.cyberneko.html.HTMLElements;
026:        import org.cyberneko.html.HTMLEntities;
027:        import org.cyberneko.html.filters.DefaultFilter;
028:
029:        import org.apache.xerces.xni.Augmentations;
030:        import org.apache.xerces.xni.NamespaceContext;
031:        import org.apache.xerces.xni.QName;
032:        import org.apache.xerces.xni.XMLAttributes;
033:        import org.apache.xerces.xni.XMLLocator;
034:        import org.apache.xerces.xni.XMLResourceIdentifier;
035:        import org.apache.xerces.xni.XMLString;
036:        import org.apache.xerces.xni.XNIException;
037:        import org.apache.xerces.xni.parser.XMLDocumentFilter;
038:        import org.apache.xerces.xni.parser.XMLInputSource;
039:        import org.apache.xerces.xni.parser.XMLParserConfiguration;
040:
041:        /**
042:         * An HTML writer written as a filter. Besides serializing the HTML
043:         * event stream, the writer also passes the document events to the next
044:         * stage in the pipeline. This allows applications to insert writer
045:         * filters between other custom filters for debugging purposes.
046:         * <p>
047:         * Since an HTML document may have specified its encoding using the
048:         * &lt;META&gt; tag and http-equiv/content attributes, the writer will
049:         * automatically change any character set specified in this tag to
050:         * match the encoding of the output stream. Therefore, the character
051:         * encoding name used to construct the writer should be an official
052:         * <a href='http://www.iana.org/assignments/character-sets'>IANA</a>
053:         * encoding name and not a Java encoding name.
054:         * <p>
055:         * <strong>Note:</strong>
056:         * The modified character set in the &lt;META&gt; tag is <em>not</em>
057:         * propagated to the next stage in the pipeline. The changed value is
058:         * only output to the stream; the original value is sent to the next
059:         * stage in the pipeline.
060:         *
061:         * @author Andy Clark
062:         *
063:         * @version $Id: Writer.java,v 1.7 2005/02/14 04:01:33 andyc Exp $
064:         */
065:        public class Writer extends DefaultFilter {
066:
067:            //
068:            // Constants
069:            //
070:
071:            /** Notify character entity references. */
072:            public static final String NOTIFY_CHAR_REFS = "http://apache.org/xml/features/scanner/notify-char-refs";
073:
074:            /** Notify built-in entity references. */
075:            public static final String NOTIFY_HTML_BUILTIN_REFS = "http://cyberneko.org/html/features/scanner/notify-builtin-refs";
076:
077:            /** Augmentations feature identifier. */
078:            protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
079:
080:            /** Filters property identifier. */
081:            protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";
082:
083:            //
084:            // Data
085:            //
086:
087:            /** The encoding. */
088:            protected String fEncoding;
089:
090:            /** 
091:             * The print writer used for serializing the document with the
092:             * appropriate character encoding. 
093:             */
094:            protected PrintWriter fPrinter;
095:
096:            // state
097:
098:            /** Seen root element. */
099:            protected boolean fSeenRootElement;
100:
101:            /** Seen http-equiv directive. */
102:            protected boolean fSeenHttpEquiv;
103:
104:            /** Element depth. */
105:            protected int fElementDepth;
106:
107:            /** Normalize character content. */
108:            protected boolean fNormalize;
109:
110:            /** Print characters. */
111:            protected boolean fPrintChars;
112:
113:            //
114:            // Constructors
115:            //
116:
117:            /** Constructs a writer filter that prints to standard out. */
118:            public Writer() {
119:                // Note: UTF-8 should *always* be a supported encoding. Although,
120:                //       I've heard of the old M$ JVM not supporting it! Amazing. -Ac
121:                try {
122:                    fEncoding = "UTF-8";
123:                    fPrinter = new PrintWriter(new OutputStreamWriter(
124:                            System.out, fEncoding));
125:                } catch (UnsupportedEncodingException e) {
126:                    throw new RuntimeException(e.getMessage());
127:                }
128:            } // <init>()
129:
130:            /**
131:             * Constructs a writer filter using the specified output stream and
132:             * encoding.
133:             *
134:             * @param outputStream The output stream to write to.
135:             * @param encoding The encoding to be used for the output. The encoding name
136:             *                 should be an official IANA encoding name.
137:             */
138:            public Writer(OutputStream outputStream, String encoding)
139:                    throws UnsupportedEncodingException {
140:                this (new OutputStreamWriter(outputStream, encoding), encoding);
141:            } // <init>(OutputStream,String)
142:
143:            /**
144:             * Constructs a writer filter using the specified Java writer and
145:             * encoding.
146:             *
147:             * @param writer The Java writer to write to.
148:             * @param encoding The encoding to be used for the output. The encoding name
149:             *                 should be an official IANA encoding name.
150:             */
151:            public Writer(java.io.Writer writer, String encoding) {
152:                fEncoding = encoding;
153:                if (writer instanceof  PrintWriter) {
154:                    fPrinter = (PrintWriter) writer;
155:                } else {
156:                    fPrinter = new PrintWriter(writer);
157:                }
158:            } // <init>(java.io.Writer,String)
159:
160:            //
161:            // XMLDocumentHandler methods
162:            //
163:
164:            // since Xerces-J 2.2.0
165:
166:            /** Start document. */
167:            public void startDocument(XMLLocator locator, String encoding,
168:                    NamespaceContext nscontext, Augmentations augs)
169:                    throws XNIException {
170:                fSeenRootElement = false;
171:                fSeenHttpEquiv = false;
172:                fElementDepth = 0;
173:                fNormalize = true;
174:                fPrintChars = true;
175:                super .startDocument(locator, encoding, nscontext, augs);
176:            } // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
177:
178:            // old methods
179:
180:            /** Start document. */
181:            public void startDocument(XMLLocator locator, String encoding,
182:                    Augmentations augs) throws XNIException {
183:                startDocument(locator, encoding, null, augs);
184:            } // startDocument(XMLLocator,String,Augmentations)
185:
186:            /** Comment. */
187:            public void comment(XMLString text, Augmentations augs)
188:                    throws XNIException {
189:                if (fSeenRootElement && fElementDepth <= 0) {
190:                    fPrinter.println();
191:                }
192:                fPrinter.print("<!--");
193:                printCharacters(text, false);
194:                fPrinter.print("-->");
195:                if (!fSeenRootElement) {
196:                    fPrinter.println();
197:                }
198:                fPrinter.flush();
199:            } // comment(XMLString,Augmentations)
200:
201:            /** Start element. */
202:            public void startElement(QName element, XMLAttributes attributes,
203:                    Augmentations augs) throws XNIException {
204:                fSeenRootElement = true;
205:                fElementDepth++;
206:                fNormalize = !HTMLElements.getElement(element.rawname)
207:                        .isSpecial();
208:                printStartElement(element, attributes);
209:                super .startElement(element, attributes, augs);
210:            } // startElement(QName,XMLAttributes,Augmentations)
211:
212:            /** Empty element. */
213:            public void emptyElement(QName element, XMLAttributes attributes,
214:                    Augmentations augs) throws XNIException {
215:                fSeenRootElement = true;
216:                printStartElement(element, attributes);
217:                super .emptyElement(element, attributes, augs);
218:            } // emptyElement(QName,XMLAttributes,Augmentations)
219:
220:            /** Characters. */
221:            public void characters(XMLString text, Augmentations augs)
222:                    throws XNIException {
223:                if (fPrintChars) {
224:                    printCharacters(text, fNormalize);
225:                }
226:                super .characters(text, augs);
227:            } // characters(XMLString,Augmentations)
228:
229:            /** End element. */
230:            public void endElement(QName element, Augmentations augs)
231:                    throws XNIException {
232:                fElementDepth--;
233:                fNormalize = true;
234:                /***
235:                // NOTE: Not sure if this is what should be done in the case where
236:                //       the encoding is not explitly declared within the HEAD. So
237:                //       I'm leaving it commented out for now. -Ac
238:                if (element.rawname.equalsIgnoreCase("head") && !fSeenHttpEquiv) {
239:                    boolean capitalize = Character.isUpperCase(element.rawname.charAt(0));
240:                    String ename = capitalize ? "META" : "meta";
241:                    QName qname = new QName(null, ename, ename, null);
242:                    XMLAttributes attrs = new XMLAttributesImpl();
243:                    QName aname = new QName(null, "http-equiv", "http-equiv", null);
244:                    attrs.addAttribute(aname, "CDATA", "Content-Type");
245:                    aname.setValues(null, "content", "content", null);
246:                    attrs.addAttribute(aname, "CDATA", "text/html; charset="+fEncoding);
247:                    super.emptyElement(qname, attrs, null);
248:                }
249:                /***/
250:                printEndElement(element);
251:                super .endElement(element, augs);
252:            } // endElement(QName,Augmentations)
253:
254:            /** Start general entity. */
255:            public void startGeneralEntity(String name,
256:                    XMLResourceIdentifier id, String encoding,
257:                    Augmentations augs) throws XNIException {
258:                fPrintChars = false;
259:                if (name.startsWith("#")) {
260:                    try {
261:                        boolean hex = name.startsWith("#x");
262:                        int offset = hex ? 2 : 1;
263:                        int base = hex ? 16 : 10;
264:                        int value = Integer.parseInt(name.substring(offset),
265:                                base);
266:                        String entity = HTMLEntities.get(value);
267:                        if (entity != null) {
268:                            name = entity;
269:                        }
270:                    } catch (NumberFormatException e) {
271:                        // ignore
272:                    }
273:                }
274:                printEntity(name);
275:                super .startGeneralEntity(name, id, encoding, augs);
276:            } // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
277:
278:            /** End general entity. */
279:            public void endGeneralEntity(String name, Augmentations augs)
280:                    throws XNIException {
281:                fPrintChars = true;
282:                super .endGeneralEntity(name, augs);
283:            } // endGeneralEntity(String,Augmentations)
284:
285:            //
286:            // Protected methods
287:            //
288:
289:            /** Print attribute value. */
290:            protected void printAttributeValue(String text) {
291:                int length = text.length();
292:                for (int j = 0; j < length; j++) {
293:                    char c = text.charAt(j);
294:                    if (c == '"') {
295:                        fPrinter.print("&quot;");
296:                    } else {
297:                        fPrinter.print(c);
298:                    }
299:                }
300:                fPrinter.flush();
301:            } // printAttributeValue(String)
302:
303:            /** Print characters. */
304:            protected void printCharacters(XMLString text, boolean normalize) {
305:                if (normalize) {
306:                    for (int i = 0; i < text.length; i++) {
307:                        char c = text.ch[text.offset + i];
308:                        if (c != '\n') {
309:                            String entity = HTMLEntities.get(c);
310:                            if (entity != null) {
311:                                printEntity(entity);
312:                            } else {
313:                                fPrinter.print(c);
314:                            }
315:                        } else {
316:                            fPrinter.println();
317:                        }
318:                    }
319:                } else {
320:                    for (int i = 0; i < text.length; i++) {
321:                        char c = text.ch[text.offset + i];
322:                        fPrinter.print(c);
323:                    }
324:                }
325:                fPrinter.flush();
326:            } // printCharacters(XMLString,boolean)
327:
328:            /** Print start element. */
329:            protected void printStartElement(QName element,
330:                    XMLAttributes attributes) {
331:
332:                // modify META[@http-equiv='content-type']/@content value
333:                int contentIndex = -1;
334:                String originalContent = null;
335:                if (element.rawname.toLowerCase().equals("meta")) {
336:                    String httpEquiv = null;
337:                    int length = attributes.getLength();
338:                    for (int i = 0; i < length; i++) {
339:                        String aname = attributes.getQName(i).toLowerCase();
340:                        if (aname.equals("http-equiv")) {
341:                            httpEquiv = attributes.getValue(i);
342:                        } else if (aname.equals("content")) {
343:                            contentIndex = i;
344:                        }
345:                    }
346:                    if (httpEquiv != null
347:                            && httpEquiv.toLowerCase().equals("content-type")) {
348:                        fSeenHttpEquiv = true;
349:                        String content = null;
350:                        if (contentIndex != -1) {
351:                            originalContent = attributes.getValue(contentIndex);
352:                            content = originalContent.toLowerCase();
353:                        }
354:                        if (content != null) {
355:                            int charsetIndex = content.indexOf("charset=");
356:                            if (charsetIndex != -1) {
357:                                content = content
358:                                        .substring(0, charsetIndex + 8);
359:                            } else {
360:                                content += ";charset=";
361:                            }
362:                            content += fEncoding;
363:                            attributes.setValue(contentIndex, content);
364:                        }
365:                    }
366:                }
367:
368:                // print element
369:                fPrinter.print('<');
370:                fPrinter.print(element.rawname);
371:                int attrCount = attributes != null ? attributes.getLength() : 0;
372:                for (int i = 0; i < attrCount; i++) {
373:                    String aname = attributes.getQName(i);
374:                    String avalue = attributes.getValue(i);
375:                    fPrinter.print(' ');
376:                    fPrinter.print(aname);
377:                    fPrinter.print("=\"");
378:                    printAttributeValue(avalue);
379:                    fPrinter.print('"');
380:                }
381:                fPrinter.print('>');
382:                fPrinter.flush();
383:
384:                // return original META[@http-equiv]/@content value
385:                if (contentIndex != -1) {
386:                    attributes.setValue(contentIndex, originalContent);
387:                }
388:
389:            } // printStartElement(QName,XMLAttributes)
390:
391:            /** Print end element. */
392:            protected void printEndElement(QName element) {
393:                fPrinter.print("</");
394:                fPrinter.print(element.rawname);
395:                fPrinter.print('>');
396:                fPrinter.flush();
397:            } // printEndElement(QName)
398:
399:            /** Print entity. */
400:            protected void printEntity(String name) {
401:                fPrinter.print('&');
402:                fPrinter.print(name);
403:                fPrinter.print(';');
404:                fPrinter.flush();
405:            } // printEntity(String)
406:
407:            //
408:            // MAIN
409:            //
410:
411:            /** Main. */
412:            public static void main(String[] argv) throws Exception {
413:                if (argv.length == 0) {
414:                    printUsage();
415:                    System.exit(1);
416:                }
417:                XMLParserConfiguration parser = new HTMLConfiguration();
418:                parser.setFeature(NOTIFY_CHAR_REFS, true);
419:                parser.setFeature(NOTIFY_HTML_BUILTIN_REFS, true);
420:                String iencoding = null;
421:                String oencoding = "Windows-1252";
422:                boolean identity = false;
423:                boolean purify = false;
424:                for (int i = 0; i < argv.length; i++) {
425:                    String arg = argv[i];
426:                    if (arg.equals("-ie")) {
427:                        iencoding = argv[++i];
428:                        continue;
429:                    }
430:                    if (arg.equals("-e") || arg.equals("-oe")) {
431:                        oencoding = argv[++i];
432:                        continue;
433:                    }
434:                    if (arg.equals("-i")) {
435:                        identity = true;
436:                        continue;
437:                    }
438:                    if (arg.equals("-p")) {
439:                        purify = true;
440:                        continue;
441:                    }
442:                    if (arg.equals("-h")) {
443:                        printUsage();
444:                        System.exit(1);
445:                    }
446:                    java.util.Vector filtersVector = new java.util.Vector(2);
447:                    if (identity) {
448:                        filtersVector.addElement(new Identity());
449:                    } else if (purify) {
450:                        filtersVector.addElement(new Purifier());
451:                    }
452:                    filtersVector.addElement(new Writer(System.out, oencoding));
453:                    XMLDocumentFilter[] filters = new XMLDocumentFilter[filtersVector
454:                            .size()];
455:                    filtersVector.copyInto(filters);
456:                    parser.setProperty(FILTERS, filters);
457:                    XMLInputSource source = new XMLInputSource(null, arg, null);
458:                    source.setEncoding(iencoding);
459:                    parser.parse(source);
460:                }
461:            } // main(String[])
462:
463:            /** Print usage. */
464:            private static void printUsage() {
465:                System.err.println("usage: java " + Writer.class.getName()
466:                        + " (options) file ...");
467:                System.err.println();
468:                System.err.println("options:");
469:                System.err
470:                        .println("  -ie name  Specify IANA name of input encoding.");
471:                System.err
472:                        .println("  -oe name  Specify IANA name of output encoding.");
473:                System.err.println("  -i        Perform identity transform.");
474:                System.err
475:                        .println("  -p        Purify output to ensure XML well-formedness.");
476:                System.err.println("  -h        Display help screen.");
477:                System.err.println();
478:                System.err.println("notes:");
479:                System.err
480:                        .println("  The -i and -p options are mutually exclusive.");
481:                System.err
482:                        .println("  The -e option has been replaced with -oe.");
483:            } // printUsage()
484:
485:        } // class Writer
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.