Source Code Cross Referenced for Page.java in » Web-Crawler » WebSPHINX » websphinx » Java Source Code / Java Documentation - Java Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » WebSPHINX » websphinx
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         * WebSphinx web-crawling toolkit
003:         *
004:         * Copyright (c) 1998-2002 Carnegie Mellon University.  All rights
005:         * reserved.
006:         *
007:         * Redistribution and use in source and binary forms, with or without
008:         * modification, are permitted provided that the following conditions
009:         * are met:
010:         *
011:         * 1. Redistributions of source code must retain the above copyright
012:         *    notice, this list of conditions and the following disclaimer.
013:         *
014:         * 2. Redistributions in binary form must reproduce the above copyright
015:         *    notice, this list of conditions and the following disclaimer in
016:         *    the documentation and/or other materials provided with the
017:         *    distribution.
018:         *
019:         * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
020:         * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
021:         * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
022:         * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
023:         * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
024:         * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
025:         * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
026:         * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
027:         * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
028:         * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
029:         * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
030:         *
031:         */
032:
033:        package websphinx;
034:
035:        import java.net.URL;
036:        import java.net.URLConnection; //#ifdef JDK1.1 
037:        import java.net.HttpURLConnection; //#endif JDK1.1
038:        import java.io.IOException;
039:        import java.io.InputStream;
040:        import rcm.util.Str;
041:
042:        /**
043:         * A Web page.  Although a Page can represent any MIME type, it mainly
044:         * supports HTML pages, which are automatically parsed.  The parsing produces
045:         * a list of tags, a list of words, an HTML parse tree, and a list of links.
046:         */
047:        public class Page extends Region {
048:
049:            // Buffer size in bytes that download() allocates when the server reports no content length
050:            static final int TYPICAL_LENGTH = 20240;
051:
052:            // Permanent content (left in place by discardContent())
053:            Link origin; // link the page was downloaded from; never assigned by the String/byte[] constructors
054:            long lastModified = 0; // taken from URLConnection.getLastModified() in download()
055:            long expiration = 0; // taken from URLConnection.getExpiration() in download()
056:            String contentType; // taken from URLConnection.getContentType() in download()
057:            String contentEncoding; // taken from URLConnection.getContentEncoding() in download()
058:            int responseCode = -1; // HTTP status code; stays -1 unless download() used an HttpURLConnection
059:            String responseMessage = null; // HTTP response message, filled in together with responseCode
060:            URL base; // base URL for relative links; download() sets it to the connection's URL
061:            String title; // returned by getTitle(); not assigned in this part of the file (presumably by the parser)
062:            Link[] links; // returned by getLinks(); not assigned in this part of the file (presumably by the parser)
063:
064:            int contentLock;
065:            // If page was downloaded from Net, represents number of
066:            //    callers who want to keep the content.
067:            // If page was created from a string, set to -1.
068:
069:            // Discardable content (thrown away when contentLock falls to 0)
070:            byte[] contentBytes; // raw bytes of the page
071:            String content; // page text; for downloads it is new String(contentBytes) in the platform default charset
072:            Region[] tokens; // isParsed() tests this for null; parse results are presumably filled in by the HTML parser
073:            Text[] words;
074:            Tag[] tags;
075:            Element[] elements;
076:            Element root; // isHTML() tests this for null
077:            String canonicalTags;
078:
079:            /**
080:             * Make a Page by downloading a Link with DownloadParameters.NO_LIMITS and a new HTMLParser.
081:             * @param link Link to download
082:             * @throws IOException if the download fails */
083:            public Page(Link link) throws IOException {
084:                this (link, DownloadParameters.NO_LIMITS, new HTMLParser());
085:            }
086:
087:            /**
088:             * Make a Page by downloading a Link, using a new HTMLParser for any parsing.
089:             * @param link Link to download
090:             * @param dp Download parameters to use
091:             * @throws IOException if the download fails */
092:            public Page(Link link, DownloadParameters dp) throws IOException {
093:                this (link, dp, new HTMLParser());
094:            }
095:
096:            /**
097:             * Make a Page by downloading a Link; afterwards the Link is pointed back at this page.
098:             * @param link Link to download
099:             * @param dp Download parameters to use
100:             * @param parser HTML parser to use
101:             * @throws IOException if the download fails; link.setPage() is not called in that case */
102:            public Page(Link link, DownloadParameters dp, HTMLParser parser) throws IOException {
103:                super(null, 0, 0);
104:                this.source = this;
105:                this.origin = link;
106:                this.base = getURL(); // provisional: download() replaces it with the connection's URL
107:                download(dp, parser);
108:                link.setPage(this);
109:            }
110:
111:            /**
112:             * Make a Page from a URL and a string of HTML, parsed with a new HTMLParser.
113:             * The created page has no originating link, so getOrigin(), getURL() and toURL() return null.
114:             * @param url URL to use as a base for relative links on the page
115:             * @param html the HTML content of the page
116:             */
117:            public Page(URL url, String html) {
118:                this (url, html, new HTMLParser());
119:            }
120:
121:            /**
122:             * Make a Page from a URL and a string of HTML, and parse it immediately.
123:             * The created page has no originating link, so getOrigin(), getURL() and toURL() return null.
124:             * @param url URL to use as a base for relative links on the page
125:             * @param html the HTML content of the page; must not be null
126:             * @param parser HTML parser to use
127:             */
128:            public Page(URL url, String html, HTMLParser parser) {
129:                super(null, 0, html.length());
130:                this.source = this;
131:                this.base = url;
132:                this.contentLock = -1; // -1 marks a page that was created from a string
133:                this.content = html;
134:                this.contentBytes = html.getBytes();
135:                parse(parser);
136:            }
137:
138:            /**
139:             * Make an unparsed Page from a string of content.
140:             * The created page has no originating link, so getOrigin(), getURL() and toURL() return null.
141:             * @param content HTML content of the page; must not be null */
142:            public Page(String content) {
143:                super(null, 0, content.length());
144:                this.source = this;
145:                this.contentLock = -1; // -1 marks a page that was created from a string
146:                this.contentBytes = content.getBytes();
147:                this.content = content;
148:                // FIX (note from the original author): don't think base==null will work
149:            }
150:
151:            /**
152:             * Make an unparsed Page from a byte array of content.  The array is copied and its text
153:             * form is decoded with the platform default charset.  No originating link or base URL is set.
154:             * @param content byte content of the page; must not be null */
155:            public Page(byte[] content) {
156:                super(null, 0, content.length);
157:                source = this;
158:                contentLock = -1; // -1 marks a page that was not downloaded
159:                this.contentBytes = (byte[]) content.clone(); // private copy of the caller's array
160:                this.content = new String(content);
161:                // Region offsets index into the decoded text, whose length differs from the byte
162:                // count under a multi-byte charset, so re-anchor the end offset on the String.
163:                end = this.content.length();
164:            }
165:
166:            //
167:            // Downloading
168:            //
169:
170:            // This code generates SecurityExceptions in Netscape 4.0,
171:            // and it doesn't seem to be necessary anyway: redirects are followed
172:            // by Netscape and JDK by default, despite the fact that the JDK
173:            // docs claim that setFollowRedirects() defaults to false
174:
175:            //static {
176:            //try {
177:            //  HttpURLConnection.setFollowRedirects (true);
178:            //} catch (Throwable t) { }
179:            //}
180:
181:            /*
182:             * Download the page.  The downloaded page is parsed 
183:             * if its MIME type is HTML or unspecified.
184:             * @param parser HTML parser to use
185:             * @exception IOException if an error occurs in downloading the page
186:             */
187:            public void download(DownloadParameters dp, HTMLParser parser)
188:                    throws IOException {
189:                URLConnection conn = Access.getAccess().openConnection(origin);
190:
191:                // fetch and store final redirected URL and response headers
192:                InputStream in = conn.getInputStream();
193:                base = conn.getURL();
194:                lastModified = conn.getLastModified();
195:                expiration = conn.getExpiration();
196:                contentType = conn.getContentType();
197:                contentEncoding = conn.getContentEncoding();
198:
199:                //#ifdef JDK1.1 
200:                // get HTTP response codes
201:                if (conn instanceof  HttpURLConnection) {
202:                    HttpURLConnection httpconn = (HttpURLConnection) conn;
203:
204:                    responseCode = httpconn.getResponseCode();
205:                    responseMessage = httpconn.getResponseMessage();
206:                    if (responseMessage == null)
207:                        responseMessage = "unknown error";
208:
209:                    if (responseCode >= 300)
210:                        // HTTP failure
211:                        throw new IOException(responseCode + " "
212:                                + responseMessage);
213:                }
214:                //#endif JDK1.1
215:
216:                //     System.err.println ("Original URL: " + origin.getURL());
217:                //     System.err.println ("Final URL: " + conn.getURL());
218:
219:                // download content
220:                int maxKB = dp.getMaxPageSize();
221:                int maxBytes = (maxKB > 0) ? maxKB * 1024 : Integer.MAX_VALUE;
222:                int expectedLength = conn.getContentLength();
223:                if (expectedLength > maxBytes)
224:                    throw new IOException("Page greater than " + maxBytes
225:                            + " bytes");
226:                if (expectedLength == -1)
227:                    expectedLength = TYPICAL_LENGTH;
228:                byte[] buf = new byte[expectedLength];
229:                int n;
230:                int total = 0;
231:
232:                while ((n = in.read(buf, total, buf.length - total)) != -1) {
233:                    total += n;
234:                    if (total > maxBytes)
235:                        throw new IOException("Page greater than " + maxBytes
236:                                + " bytes");
237:                    if (total == buf.length) {
238:                        // try to read one more character
239:                        int c = in.read();
240:                        if (c == -1)
241:                            break; // EOF, we're done
242:                        else {
243:                            // need more space in array.  Double the array, but don't make
244:                            // it bigger than maxBytes.
245:                            byte[] newbuf = new byte[Math.min(buf.length * 2,
246:                                    maxBytes)];
247:                            System.arraycopy(buf, 0, newbuf, 0, buf.length);
248:                            buf = newbuf;
249:                            buf[total++] = (byte) c;
250:                        }
251:                    }
252:                }
253:                in.close();
254:
255:                if (total != buf.length) {
256:                    // resize the array to be precisely total bytes long
257:                    byte[] newbuf = new byte[total];
258:                    System.arraycopy(buf, 0, newbuf, 0, total);
259:                    buf = newbuf;
260:                }
261:
262:                contentBytes = buf;
263:                content = new String(buf);
264:                start = 0;
265:                end = total;
266:                contentLock = 1;
267:
268:                //  parse the response
269:                if (contentType == null || contentType.startsWith("text/html")
270:                        || contentType.startsWith("content/unknown"))
271:                    parse(parser);
272:            }
273:
274:            void downloadSafely() { // best-effort download with default parameters and a new parser
275:                try {
276:                    download(new DownloadParameters(), new HTMLParser());
277:                } catch (Throwable ignored) { // swallowed on purpose: callers see whatever state is left
278:                }
279:            }
280:
281:            //
282:            // Parsing
283:            //
284:
285:            /**
286:             * Parse the page with the given parser.  If the content is not currently available,
287:             * a best-effort download is attempted first.
288:             * @param parser HTML parser to use
289:             * @exception RuntimeException if the parser throws an IOException (kept as the cause) */
290:            public void parse(HTMLParser parser) {
291:                if (!hasContent())
292:                    downloadSafely();
293:                try {
294:                    parser.parse(this);
295:                } catch (IOException e) { // same message as before, now with the cause chained
296:                    throw new RuntimeException(e.toString(), e);
297:                }
298:            }
299:
300:            /**
301:             * Test whether page has been parsed.  During download a page is parsed only
302:             * if its MIME type is HTML or unspecified.
303:             * @return true if the token array is present, false if not
304:             */
305:            public boolean isParsed() {
306:                return this.tokens != null;
307:            }
308:
309:            /**
310:             * Test whether page is HTML, judged by whether it has a root element.
311:             * @return true if a root element is present, false otherwise
312:             */
313:            public boolean isHTML() {
314:                return this.root != null;
315:            }
316:
317:            /**
318:             * Test whether page is a GIF or JPEG image, judged by its leading magic bytes.
319:             * @return true if the content starts with GIF_MAGIC or JPG_MAGIC; false otherwise,
320:             *         including when no content could be obtained
321:             */
322:            public boolean isImage() {
323:                byte[] bytes = getContentBytes(); // may be null when a re-download failed
324:                return bytes != null && (startsWith(bytes, GIF_MAGIC) || startsWith(bytes, JPG_MAGIC));
325:            }
326:
327:            private static final byte[] GIF_MAGIC = { (byte) 'G', (byte) 'I', // leading bytes that isImage() accepts as GIF
328:                    (byte) 'F', (byte) '8' }; // ASCII "GIF8"
329:            private static final byte[] JPG_MAGIC = { (byte) 0377, (byte) 0330, // octal literals: 0xFF 0xD8
330:                    (byte) 0377, (byte) 0340, (byte) 0, (byte) 020, (byte) 'J', // 0xFF 0xE0 0x00 0x10 'J'
331:                    (byte) 'F', (byte) 'I', (byte) 'F' }; // ...'F' 'I' 'F'; only content starting with exactly these bytes matches
332:
333:            private boolean startsWith(byte[] bytes, byte[] prefix) { // true iff bytes begins with prefix
334:                if (bytes.length < prefix.length)
335:                    return false; // too short to contain the prefix
336:                int matched = 0;
337:                while (matched < prefix.length && bytes[matched] == prefix[matched])
338:                    ++matched;
339:                return matched == prefix.length;
340:            }
341:
342:            //
343:            // Content management
344:            //
345:
346:            /**
347:             * Lock the page's content (to prevent it from being discarded).
348:             * Each call adds one to the lock counter, which download() sets to 1;
349:             * discardContent() takes one away again.  Nothing happens while the counter is
350:             * not positive (content already discarded, or page not created by downloading).
351:             */
352:            public void keepContent() {
353:                if (contentLock <= 0) return; // 0 = no content to keep, negative = never downloaded
354:                contentLock++;
355:            }
356:
357:            /**
358:             * Release one lock on the page's content, discarding the content once no
359:             * lock remains (to save space during a Web crawl).
360:             * <P> Each call decrements the lock counter.  When the counter is no longer
361:             * positive and the page has an originating link, these fields are cleared:
362:             * contentBytes, content, tokens, tags, words, elements, root and
363:             * canonicalTags.  After the content has been discarded, calling getContent()
364:             * (or getTokens(), getTags(), etc.) will attempt to download the page
365:             * again.  Hopefully the download will come from the cache, however.
366:             * <P> Links are not considered part of the content and are kept, although
367:             * discardContent() is invoked on each of them.  Also, if the page was created
368:             * from a string (rather than by downloading), it has no originating link and
369:             * its content is never discarded (since there would be no way to recover it).
370:             * Calling this method on a page whose content is already discarded does nothing.
371:             */
372:            public void discardContent() {
373:                if (contentLock == 0) {
374:                    return; // already discarded
375:                }
376:
377:                contentLock -= 1;
378:                boolean stillLocked = contentLock > 0; // somebody else still has a lock on the content
379:                boolean recoverable = origin != null; // without an origin the page could not be re-downloaded
380:                if (stillLocked || !recoverable) {
381:                    return;
382:                }
383:
384:                // drop the raw content and everything derived from it
385:                root = null;
386:                elements = null;
387:                tags = null;
388:                words = null;
389:                tokens = null;
390:                canonicalTags = null;
391:                content = null;
392:                contentBytes = null;
393:
394:                // keep links, but isolate them from the element tree
395:                if (links != null) {
396:                    for (int k = 0; k < links.length; k++) {
397:                        if (links[k] != null) {
398:                            links[k].discardContent();
399:                        }
400:                    }
401:                }
402:
403:                contentLock = 0;
404:            }
405:
406:            /**
407:             * Test if page content is available.
408:             * @return true if the lock counter is nonzero (content downloaded, or page created from a
409:             * string or byte array); false if content has not been downloaded or has been discarded.
410:             */
411:            public final boolean hasContent() {
412:                return this.contentLock != 0;
413:            }
414:
415:            //
416:            // Page accessors
417:            //
418:
419:            /**
420:             * Get depth of page in crawl.
421:             * @return depth of the originating link, or 0 if the page has no originating link */
422:            public int getDepth() {
423:                if (origin == null) return 0;
424:                return origin.getDepth();
425:            }
426:
427:            /**
428:             * Get the Link that points to this page.
429:             * @return the Link used to download this page, or null if the page was not created from a Link
430:             */
431:            public Link getOrigin() {
432:                return this.origin;
433:            }
434:
435:            /**
436:             * Get the base URL, relative to which the page's links were interpreted.
437:             * For a downloaded page the base starts as the URL of the originating Link and
438:             * is replaced during download by the URL reported by the connection, so
439:             * redirects are reflected.  For a page created from a URL and a string it is the
440:             * URL given to the constructor.  According to the original documentation, a
441:             * &lt;BASE&gt; element found in the page then becomes the new base URL.
442:             * @return the page's base URL; null if none was ever set.
443:             */
444:            public URL getBase() {
445:                return this.base;
446:            }
447:
448:            /**
449:             * Get the URL.
450:             * @return the URL of the originating link, or null if the page has no originating link */
451:            public URL getURL() {
452:                if (origin == null) return null;
453:                return origin.getURL();
454:            }
455:
456:            /**
457:             * Get the title of the page.
458:             * @return the stored title, or null if none has been set (e.g. the page hasn't been parsed).
459:             */
460:            public String getTitle() {
461:                return this.title;
462:            }
463:
464:            /**
465:             * Get the content of the page as a String; unavailable content is first re-downloaded (best effort).
466:             * May not work properly for binary data like images; use getContentBytes instead.
467:             * @return the String content of the page; may be null if the download did not succeed */
468:            public String getContent() {
469:                if (!hasContent()) {
470:                    downloadSafely();
471:                }
472:                return this.content;
473:            }
474:
475:            /**
476:             * Get the content of the page as an array of bytes; unavailable content is first re-downloaded (best effort).
477:             * @return the content of the page in binary form; may be null if the download did not succeed */
478:            public byte[] getContentBytes() {
479:                if (!hasContent()) {
480:                    downloadSafely();
481:                }
482:                return this.contentBytes;
483:            }
484:
485:            /**
486:             * Get the token sequence of the page (tags and whitespace-delimited text), re-downloading if needed.
487:             * @return token regions in the page, or null if the page hasn't been downloaded or parsed. */
488:            public Region[] getTokens() {
489:                if (!hasContent()) {
490:                    downloadSafely();
491:                }
492:                return this.tokens;
493:            }
494:
495:            /**
496:             * Get the tag sequence of the page, re-downloading the page first if its content is unavailable.
497:             * @return tags in the page, or null if the page hasn't been downloaded or parsed. */
498:            public Tag[] getTags() {
499:                if (!hasContent()) {
500:                    downloadSafely();
501:                }
502:                return this.tags;
503:            }
504:
505:            /**
506:             * Get the words in the page (whitespace- and tag-delimited text), re-downloading if needed.
507:             * @return words in the page, or null if the page hasn't been downloaded or parsed. */
508:            public Text[] getWords() {
509:                if (!hasContent()) {
510:                    downloadSafely();
511:                }
512:                return this.words;
513:            }
514:
515:            /**
516:             * Get the HTML elements in the page, re-downloading the page first if its
517:             * content is unavailable.  All elements in the page are included in the list,
518:             * in the order they would appear in an inorder traversal of the HTML parse tree.
519:             * @return HTML elements in the page ordered by inorder, or null if the page
520:             * hasn't been downloaded or parsed. */
521:            public Element[] getElements() {
522:                if (!hasContent()) {
523:                    downloadSafely();
524:                }
525:                return this.elements;
526:            }
527:
528:            /**
529:             * Get the root HTML element of the page, re-downloading the page first if its content is unavailable.
530:             * @return first top-level HTML element in the page, or null
531:             * if the page hasn't been downloaded or parsed. */
532:            public Element getRootElement() {
533:                if (!hasContent()) {
534:                    downloadSafely();
535:                }
536:                return this.root;
537:            }
538:
539:            /**
540:             * Get the links found in the page.  Unlike the content accessors, this never triggers a download.
541:             * @return links in the page, or null
542:             * if the page hasn't been downloaded or parsed.
543:             */
544:            public Link[] getLinks() {
545:                return this.links;
546:            }
547:
548:            /**
549:             * Convert the originating link's URL to a String.
550:             * @return the result of the link's toURL(), or null if the page has no originating link */
551:            public String toURL() {
552:                if (origin == null) return null;
553:                return origin.toURL();
554:            }
555:
556:            /**
557:             * Generate a human-readable description of the page.
558:             * @return "title [url]", or just "[url]" when the title is null or empty.
559:             */
560:            public String toDescription() {
561:                String prefix = (title == null || title.length() == 0) ? "" : title + " ";
562:                return prefix + "[" + getURL() + "]";
563:            }
564:
565:            /**
566:             * Get the page as a String, which is simply its content.
567:             * @return the same value as getContent(), which may be null */
568:            public String toString() {
569:                String text = getContent();
570:                return text;
571:            }
572:
573:            /**
574:             * Get last-modified date of page.
575:             * @return the date when the page was last modified, or 0 if not known.  After a download
576:             * this is the value of URLConnection.getLastModified(): milliseconds since January 1, 1970 GMT.
577:             */
578:            public long getLastModified() {
579:                return this.lastModified;
580:            }
581:
582:            /**
583:             * Set last-modified date of page.  A later download overwrites this value.
584:             * @param last the date when the page was last modified, or 0 if not known.  For consistency
585:             * with downloaded pages, use milliseconds since January 1, 1970 GMT.
586:             */
587:            public void setLastModified(long last) {
588:                this.lastModified = last;
589:            }
590:
591:            /**
592:             * Get expiration date of page.
593:             * @return the expiration date of the page, or 0 if not known.  After a download
594:             * this is the value of URLConnection.getExpiration(): milliseconds since January 1, 1970 GMT.
595:             */
596:            public long getExpiration() {
597:                return this.expiration;
598:            }
599:
600:            /**
601:             * Set expiration date of page.  A later download overwrites this value.
602:             * @param expire the expiration date of the page, or 0 if not known.  For consistency
603:             * with downloaded pages, use milliseconds since January 1, 1970 GMT.
604:             */
605:            public void setExpiration(long expire) {
606:                this.expiration = expire;
607:            }
608:
609:            /**
610:             * Get MIME type of page, as reported by the connection during download or set explicitly.
611:             * @return the MIME type of page, such as "text/html", or null if not known.
612:             */
613:            public String getContentType() {
614:                return this.contentType;
615:            }
616:
617:            /**
618:             * Set MIME type of page.  A later download overwrites this value.
619:             * @param type the MIME type of page, such as "text/html", or null if not known.
620:             */
621:            public void setContentType(String type) {
622:                this.contentType = type;
623:            }
624:
625:            /**
626:             * Get content encoding of page, as reported by the connection during download or set explicitly.
627:             * @return the content encoding of the page, such as "gzip", or null if not known.
628:             */
629:            public String getContentEncoding() {
630:                return this.contentEncoding;
631:            }
632:
633:            /**
634:             * Set content encoding of page.  A later download overwrites this value.
635:             * @param encoding the content encoding of the page, such as "gzip", or null if not known.
636:             */
637:            public void setContentEncoding(String encoding) {
638:                this.contentEncoding = encoding;
639:            }
640:
641:            /**
642:             * Get response code returned by the Web server.  It is recorded only when the
643:             * page is downloaded over an HttpURLConnection; see that class for possible values.
644:             * @return response code, such as 200 (for OK) or 404 (not found).
645:             * Code is -1 if unknown.
646:             * @see java.net.HttpURLConnection
647:             */
648:            public int getResponseCode() {
649:                return this.responseCode;
650:            }
651:
652:            /**
653:             * Get response message returned by the Web server.
654:             * @return response message, such as "OK" or "Not Found"; "unknown error" if an HTTP response carried no message; null if no HTTP response was recorded.
655:             */
656:            public String getResponseMessage() {
657:                return this.responseMessage;
658:            }
659:
660:            /**
661:             * Get raw content found in a region, as a substring of getContent().
662:             * @param start starting offset of region
663:             * @param end ending offset of region
664:             * @return raw HTML contained in the region */
665:            public String substringContent(int start, int end) {
666:                String text = getContent();
667:                return text.substring(start, end);
668:            }
669:
670:            /**
671:             * Get HTML found in a region.  When the page is not HTML, the characters
672:             * &amp;, &lt; and &gt; are replaced by entities and the text is wrapped in a PRE element.
673:             * @param start starting offset of region
674:             * @param end ending offset of region
675:             * @return representation of region as HTML */
676:            public String substringHTML(int start, int end) {
677:                String raw = getContent().substring(start, end);
678:                if (isHTML())
679:                    return raw; // already markup, hand it back untouched
680:                // escape '&' first so the entities added afterwards are not escaped again
681:                String escaped = Str.replace(raw, "&", "&amp;");
682:                escaped = Str.replace(escaped, "<", "&lt;");
683:                escaped = Str.replace(escaped, ">", "&gt;");
684:                return "<PRE>" + escaped + "</PRE>";
685:            }
686:
687:            /**
688:             * Get tagless text found in a region.
689:             * Runs of whitespace and tags are reduced to a single space character.
690:             * @param start starting offset of region
691:             * @param end ending offset of region
692:             * @return tagless text contained in the region
693:             */
694:            public String substringText(int start, int end) {
695:                if (words == null)
696:                    return ""; // page is not parsed
697:
698:                // FIX: find some other mapping
699:                StringBuffer buf = new StringBuffer();
700:                for (int j = findStart(words, start); j < words.length; ++j) {
701:                    if (words[j].end > end)
702:                        break;
703:                    else {
704:                        if (buf.length() > 0)
705:                            buf.append(' ');
706:                        buf.append(words[j].text);
707:                    }
708:                }
709:                return buf.toString();
710:            }
711:
712:            /**
713:             * Get HTML tags found in a region.  Whitespace and text among the
714:             * tags are deleted.
715:             * @param start starting offset of region
716:             * @param end ending offset of region
717:             * @return tags contained in the region
718:             */
719:            public String substringTags(int start, int end) {
720:                if (tags == null)
721:                    return ""; // page is not parsed
722:
723:                // FIX: find some other mapping
724:                StringBuffer buf = new StringBuffer();
725:                for (int j = findStart(tags, start); j < tags.length; ++j) {
726:                    if (tags[j].end > end)
727:                        break;
728:                    else {
729:                        if (buf.length() > 0)
730:                            buf.append(' ');
731:                        buf.append(getContent().substring(tags[j].start,
732:                                tags[j].end));
733:                    }
734:                }
735:                return buf.toString();
736:            }
737:
738:            /**
739:             * Get canonicalized HTML tags found in a region.
740:             * A canonicalized tag looks like the following:
741:             * <PRE>
742:             * &lt;tagname#index attr=value attr=value attr=value ...&gt
743:             * <PRE>
744:             * where tagname and attr are all lowercase, index is the tag's
745:             * index in the page's tokens array.  Attributes are sorted in
746:             * increasing order by attribute name. Attributes without values
747:             * omit the entire "=value" portion.  Values are delimited by a 
748:             * space.  All occurences of &lt, &gt, space, and % characters 
749:             * in a value are URL-encoded (e.g., space is converted to %20).  
750:             * Thus the only occurences of these characters in the canonical 
751:             * tag are the tag delimiters.
752:             *
753:             * <P>For example, raw HTML that looks like:
754:             * <PRE>
755:             * &lt;IMG SRC="http://foo.com/map&lt;&gt;.gif" ISMAP&gt;Image&lt;/IMG&gt;
756:             * </PRE>
757:             * would be canonicalized to:
758:             * <PRE>
759:             * &lt;img ismap src=http://foo.com/map%3C%3E.gif&gt;&lt;/img&gt;
760:             * </PRE>
761:             * <P>
762:             * Comment and declaration tags (whose tag name is !) are omitted
763:             * from the canonicalization.
764:             *
765:             * @param start starting offset of region
766:             * @param end ending offset of region
767:             * @return canonicalized tags contained in the region
768:             */
769:            public String substringCanonicalTags(int start, int end) {
770:                if (tokens == null)
771:                    return ""; // page is not parsed
772:
773:                boolean all = (start == this .start && end == this .end);
774:
775:                if (all && canonicalTags != null)
776:                    return canonicalTags;
777:
778:                // FIX: find some other mapping
779:                StringBuffer buf = new StringBuffer();
780:                for (int j = findStart(tokens, start); j < tokens.length; ++j) {
781:                    if (tokens[j].end > end)
782:                        break;
783:                    else if (tokens[j] instanceof  Tag)
784:                        Tagexp.canonicalizeTag(buf, (Tag) tokens[j], j);
785:                }
786:
787:                String result = buf.toString();
788:                if (all)
789:                    canonicalTags = result;
790:                return result;
791:            }
792:
793:            public static void main(String[] args) throws Exception {
794:                int method = Link.GET;
795:
796:                for (int i = 0; i < args.length; ++i) {
797:                    if (args[i].equals("-post"))
798:                        method = Link.POST;
799:                    else if (args[i].equals("-get"))
800:                        method = Link.GET;
801:                    else {
802:                        Link link = method == Link.GET ? new Link(args[i])
803:                                : new Link(args[i]); // FIX: POST?
804:                        try {
805:                            Page p = new Page(link);
806:                            System.out.write(p.getContentBytes());
807:                        } catch (IOException e) {
808:                            System.out.println(e);
809:                        }
810:                    }
811:                }
812:            }
813:
814:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.