Source Code Cross Referenced for WebSiteIndexer.java in Portal » stringbeans-3.5 » com.nabhinc.portlet.search



/*
 * (C) Copyright 2004 Nabh Information Systems, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 */
package com.nabhinc.portlet.search;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.Date;
import java.util.HashMap;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.portlet.PortletException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.DateField;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.lexer.nodes.StringNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
/**
 * Crawls a web site starting from a seed URL and builds a Lucene
 * full-text index of the HTML and PDF documents it finds.
 *
 * @author Padmanabh Dabke
 * (c) 2004 Nabh Information Systems, Inc. All Rights Reserved.
 */
public class WebSiteIndexer {

    private static final char FILE_SEPARATOR = System.getProperty(
            "file.separator").charAt(0);
    private static Log wsiLogger = LogFactory.getLog(WebSiteIndexer.class);

    public static void main(String[] args) {
        String seedURL = "http://localhost/stringbeans";
        String indexDir = "C:/temp/index";
        try {
            String[] includeRegex = { "http://localhost" };
            String[] excludeRegex = { ".*maximized.*", ".*minimized.*",
                    ".*process_action.*" };
            createSiteIndex(seedURL, indexDir, includeRegex, excludeRegex, 10);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    public static void createSiteIndex(String seedURLStr, String indexDirStr,
            String[] includeRegex, String[] excludeRegex, int maxDepth)
            throws PortletException {
        IndexWriter writer; // new index being built
        final TagNameFilter filter = new TagNameFilter("A");
        NodeList list;
        Parser parser = null;

        // Check for a valid index directory. If the directory does not exist,
        // create one.
        File indexDir = new File(indexDirStr);
        if (indexDir.isFile()) {
            throw new PortletException(indexDirStr
                    + " is a file, not a directory.");
        }
        if (!indexDir.exists()) {
            indexDir.mkdirs();
        }

        try {
            writer = new IndexWriter(indexDirStr, new StandardAnalyzer(), true);
            writer.setMaxFieldLength(1000000);
        } catch (Exception ex) {
            throw new PortletException(
                    "Could not create search index writer.", ex);
        }

        // Holds all URLs we have visited
        HashMap urlCollection = new HashMap();

        // URLs we are currently working on
        Vector currentURLs = new Vector();

        // Links found in processing current generation documents
        Vector nextGenURLs = new Vector();

        // Start off with the seed URL
        currentURLs.addElement(seedURLStr);
        int depth = 0;
        int includeLen = includeRegex == null ? 0 : includeRegex.length;
        int excludeLen = excludeRegex == null ? 0 : excludeRegex.length;

        // Create pattern objects corresponding to inclusion/exclusion pattern
        // strings
        Pattern[] includePatterns = new Pattern[includeLen];
        for (int j = 0; j < includeLen; j++) {
            includePatterns[j] = Pattern.compile(includeRegex[j]);
        }
        Pattern[] excludePatterns = new Pattern[excludeLen];
        for (int j = 0; j < excludeLen; j++) {
            excludePatterns[j] = Pattern.compile(excludeRegex[j]);
        }

        // Index URLs till we run out of depth or URLs
        while (depth < maxDepth && currentURLs.size() > 0) {
            int len = currentURLs.size();
            for (int count = 0; count < len; count++) {
                String currentURL = (String) currentURLs.elementAt(count);
                System.out.println("Processing: " + currentURL);
                boolean isNotHTML = true;
                try {
                    URL docURL = new URL(currentURL);
                    URLConnection conn = docURL.openConnection();
                    String contentType = conn.getContentType();
                    Document doc = null;
                    if (contentType == null) {
                        // Index binary document
                        continue; // for now
                    } else if (contentType.indexOf("text/html") > -1) {
                        parser = new Parser(conn);
                        doc = getHTMLDocument(parser);
                        if (doc == null)
                            continue;
                        isNotHTML = false;
                    } else if (contentType.indexOf("application/pdf") > -1
                            || contentType.indexOf("application/x-pdf") > -1) {
                        // Index PDF document
                        doc = getPDFDocument(conn, currentURL);
                    } else {
                        continue;
                    }

                    // Add the URL as a field named "url". Use an unindexed
                    // field, so that the URL is just stored with the document,
                    // but is not searchable.
                    doc.add(new Field("url", docURL.toExternalForm(),
                            Field.Store.YES, Field.Index.NO));

                    // Add the last-modified date of the file as a field named
                    // "modified". Use an untokenized field, so that it is
                    // searchable, but no attempt is made to tokenize the field
                    // into words.
                    doc.add(new Field("modified",
                            DateField.timeToString(conn.getLastModified()),
                            Field.Store.YES, Field.Index.UN_TOKENIZED));

                    String uid = docURL.toExternalForm().replace(
                            FILE_SEPARATOR, '\u0000')
                            + "\u0000"
                            + DateField.timeToString(conn.getLastModified());

                    // Add the uid as a field, so that the index can be
                    // incrementally maintained. This field is not stored with
                    // the document; it is indexed, but not tokenized prior to
                    // indexing.
                    doc.add(new Field("uid", uid, Field.Store.NO,
                            Field.Index.UN_TOKENIZED));

                    // Add document to the index
                    writer.addDocument(doc);

                    if (isNotHTML)
                        continue;

                    // Extract links in the document
                    parser.reset();
                    list = parser.extractAllNodesThatMatch(filter);
                    skiplink: for (int i = 0; i < list.size(); i++) {
                        LinkTag link = (LinkTag) list.elementAt(i);
                        String url = link.getLink();
                        if (link.isHTTPLink()) {
                            boolean found = false;
                            for (int j = 0; j < includeLen; j++) {
                                Matcher m = includePatterns[j].matcher(url);
                                if (m.find()) {
                                    found = true;
                                    break;
                                }
                            }

                            if (!found) {
                                continue skiplink;
                            }

                            for (int j = 0; j < excludeLen; j++) {
                                Matcher m = excludePatterns[j].matcher(url);
                                if (m.find()) {
                                    continue skiplink;
                                }
                            }
                            if (!urlCollection.containsKey(url)) {
                                urlCollection.put(url, "");
                                nextGenURLs.addElement(url);
                            }
                        }
                    }
                } catch (IOException ioe) {
                    wsiLogger.warn("Broken URL: " + currentURL, ioe);
                } catch (ParserException ex) {
                    wsiLogger.warn("Broken URL: " + currentURL, ex);
                }
            }
            depth++;
            // Swap the current and next-generation URL lists for the next pass
            Vector tempVec = currentURLs;
            currentURLs = nextGenURLs;
            nextGenURLs = tempVec;
            nextGenURLs.clear();
        }

        System.out.println("Optimizing index...");

        try {
            writer.optimize();
            writer.close();
        } catch (Exception ex) {
            throw new PortletException(
                    "Could not optimize search index.", ex);
        }
    }

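The crawl loop above admits a link only when it matches at least one inclusion pattern and no exclusion pattern. That rule can be isolated into a small self-contained sketch; the class and method names here are illustrative, not part of the original source:

```java
import java.util.regex.Pattern;

public class UrlFilterSketch {

    // A URL is indexable when some include pattern finds a match in it
    // and no exclude pattern does, mirroring the skiplink loop above.
    public static boolean shouldIndex(String url, Pattern[] include,
            Pattern[] exclude) {
        boolean found = false;
        for (Pattern p : include) {
            if (p.matcher(url).find()) {
                found = true;
                break;
            }
        }
        if (!found) {
            return false;
        }
        for (Pattern p : exclude) {
            if (p.matcher(url).find()) {
                return false;
            }
        }
        return true;
    }

    public static void main(String[] args) {
        Pattern[] inc = { Pattern.compile("http://localhost") };
        Pattern[] exc = { Pattern.compile(".*maximized.*") };
        System.out.println(shouldIndex("http://localhost/stringbeans/page", inc, exc));
        System.out.println(shouldIndex("http://localhost/page?maximized=true", inc, exc));
        System.out.println(shouldIndex("http://example.com/page", inc, exc));
    }
}
```

Note that `find()` is used rather than `matches()`, so an include pattern such as `"http://localhost"` acts as a prefix/substring test, which is why the exclusion patterns in `main` wrap themselves in `.*`.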
    private static String getSummary(String content) {
        if (content == null)
            return "";
        if (content.length() < 200)
            return content;
        int spaceIndex = content.indexOf(" ", 200);
        if (spaceIndex == -1)
            return content;
        return content.substring(0, spaceIndex);
    }

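`getSummary` returns short content whole and cuts longer content at the first space at or after index 200. The behavior can be exercised in isolation; the class name below is illustrative:

```java
public class SummarySketch {

    // Mirrors getSummary above: content under 200 characters is returned
    // whole; longer content is cut at the first space at or after index 200;
    // content with no such space is also returned whole.
    public static String summary(String content) {
        if (content == null)
            return "";
        if (content.length() < 200)
            return content;
        int spaceIndex = content.indexOf(" ", 200);
        if (spaceIndex == -1)
            return content;
        return content.substring(0, spaceIndex);
    }

    public static void main(String[] args) {
        System.out.println(summary("short text"));
        String longText = "x".repeat(205) + " tail words";
        System.out.println(summary(longText).length());
    }
}
```

One consequence worth noting: a long document whose first 200+ characters contain no space is stored in full as its own summary.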
    private static Document getHTMLDocument(Parser parser)
            throws PortletException {
        final NodeFilter titleFilter = new TagNameFilter("TITLE");
        final NodeFilter stringFilter = new NodeClassFilter(StringNode.class);
        Document doc = new Document();

        try {
            // Add the title as a separate text field, so that it can be
            // searched separately.
            NodeList list = parser.extractAllNodesThatMatch(titleFilter);
            String title = "Untitled";
            if (list != null && list.size() != 0) {
                TitleTag titleTag = (TitleTag) list.elementAt(0);
                title = titleTag.getChildrenHTML();
            }
            doc.add(new Field("title", title, Field.Store.YES,
                    Field.Index.TOKENIZED));
            System.out.println("Doc Title: " + title);

            // Add the tag-stripped contents as a text field so it will
            // get tokenized and indexed.
            parser.reset();
            StringBean sb = new StringBean();
            sb.setLinks(false);
            parser.visitAllNodesWith(sb);
            String content = sb.getStrings();
            if (content == null)
                return null;
            doc.add(new Field("contents", content, Field.Store.YES,
                    Field.Index.TOKENIZED));

            // Add the summary as an unindexed field, so that it is stored
            // and returned with hit documents for display.
            String summary = getSummary(content);
            System.out.println("Summary: ");
            System.out.println(summary);
            doc.add(new Field("summary", summary, Field.Store.YES,
                    Field.Index.NO));

            return doc;
        } catch (Exception ex) {
            throw new PortletException("Failed to index HTML document.", ex);
        }
    }

    private static Document getPDFDocument(URLConnection connection,
            String docURL) throws PortletException {
        try {
            Document document = new Document();
            connection.connect();
            InputStream input = null;
            try {
                input = connection.getInputStream();
                addPDFContent(document, input, docURL);
            } finally {
                if (input != null) {
                    input.close();
                }
            }
            return document;
        } catch (Exception ex) {
            throw new PortletException("Failed to index PDF document.", ex);
        }
    }

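The `addPDFContent` helper below buffers the stripped PDF text in a `ByteArrayOutputStream`, then re-reads the same bytes both through a `Reader` (for the tokenized `contents` field) and directly as a byte array (for the stored summary). That round-trip, minus the PDFBox and Lucene dependencies, can be sketched on its own; the class and method names are illustrative:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;

public class BufferRoundTripSketch {

    // Writes text through an OutputStreamWriter into an in-memory buffer,
    // then reads the resulting bytes back through a Reader, as addPDFContent
    // does with the PDFTextStripper output.
    public static String roundTrip(String text) throws Exception {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        OutputStreamWriter writer = new OutputStreamWriter(out);
        writer.write(text);
        writer.close(); // flush the writer so all bytes reach the buffer

        byte[] contents = out.toByteArray();
        Reader reader = new InputStreamReader(new ByteArrayInputStream(contents));
        StringBuilder sb = new StringBuilder();
        int c;
        while ((c = reader.read()) != -1) {
            sb.append((char) c);
        }
        return sb.toString();
    }

    public static void main(String[] args) throws Exception {
        System.out.println(roundTrip("extracted pdf text"));
    }
}
```

Because the bytes live in memory, the `Reader` handed to the index remains valid after the original URL stream is closed, which is why `getPDFDocument` can safely close `input` in its `finally` block before the document is added to the index.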
    /**
     * Adds the contents of a PDF stream to the Lucene document.
     *
     * @param document The document to add the contents to.
     * @param is The stream to get the contents from.
     * @param documentLocation The location of the document, used only for error messages.
     *
     * @throws PortletException If there is an error parsing the document.
     */
    private static void addPDFContent(Document document, InputStream is,
            String documentLocation) throws PortletException {
        PDDocument pdfDocument = null;
        try {
            pdfDocument = PDDocument.load(is);

            if (pdfDocument.isEncrypted()) {
                // Just try using the default password and move on
                pdfDocument.decrypt("");
            }

            // Create a temporary in-memory output stream for the extracted text
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            OutputStreamWriter writer = new OutputStreamWriter(out);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(pdfDocument, writer);
            writer.close();

            byte[] contents = out.toByteArray();
            InputStreamReader input = new InputStreamReader(
                    new ByteArrayInputStream(contents));
            // Add the extracted contents as a Reader-valued field so it will
            // get tokenized and indexed.
            document.add(new Field("contents", input));

            PDDocumentInformation info = pdfDocument.getDocumentInformation();
            if (info.getAuthor() != null) {
                document.add(new Field("Author", info.getAuthor(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getCreationDate() != null) {
                Date date = info.getCreationDate().getTime();
                // Lucene cannot handle dates before the epoch and throws a
                // RuntimeException, so verify that this does not happen
                if (date.getTime() >= 0) {
                    document.add(new Field("CreationDate",
                            DateField.dateToString(date), Field.Store.YES,
                            Field.Index.TOKENIZED));
                }
            }
            if (info.getCreator() != null) {
                document.add(new Field("Creator", info.getCreator(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getKeywords() != null) {
                document.add(new Field("Keywords", info.getKeywords(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getModificationDate() != null) {
                Date date = info.getModificationDate().getTime();
                // Lucene cannot handle dates before the epoch and throws a
                // RuntimeException, so verify that this does not happen
                if (date.getTime() >= 0) {
                    document.add(new Field("ModificationDate",
                            DateField.dateToString(date), Field.Store.YES,
                            Field.Index.NO));
                }
            }
            if (info.getProducer() != null) {
                document.add(new Field("Producer", info.getProducer(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getSubject() != null) {
                document.add(new Field("Subject", info.getSubject(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getTitle() != null) {
                document.add(new Field("Title", info.getTitle(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }
            if (info.getTrapped() != null) {
                document.add(new Field("Trapped", info.getTrapped(),
                        Field.Store.YES, Field.Index.TOKENIZED));
            }

            int summarySize = Math.min(contents.length, 500);
            // Add the summary as an unindexed field, so that it is stored
            // and returned with hit documents for display.
            document.add(new Field("summary",
                    new String(contents, 0, summarySize),
                    Field.Store.YES, Field.Index.NO));
        } catch (CryptographyException e) {
            throw new PortletException("Error decrypting document ("
                    + documentLocation + ").", e);
        } catch (InvalidPasswordException e) {
            // No password was supplied and the default of "" was wrong.
            throw new PortletException("Error: The document ("
                    + documentLocation
                    + ") is encrypted and will not be indexed.", e);
        } catch (IOException e) {
            throw new PortletException("IO exception in indexing PDF document "
                    + documentLocation + ".", e);
        } finally {
            if (pdfDocument != null) {
                try {
                    pdfDocument.close();
                } catch (Exception ex) {
                    // ignore
                }
            }
        }
    }

}
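The crawler builds a per-document `uid` from the URL (with file separators collapsed to NUL characters) plus a NUL and the last-modified timestamp, so the index could later be maintained incrementally. A self-contained sketch of that scheme, with an illustrative class and method name and a made-up timestamp format in place of Lucene's `DateField` encoding:

```java
public class UidSketch {

    // Mirrors the uid built in createSiteIndex: separator characters in the
    // URL are replaced with NUL, then a NUL and the modification timestamp
    // are appended. A changed timestamp therefore yields a different uid
    // for the same URL.
    public static String uid(String url, char fileSeparator, String modifiedStamp) {
        return url.replace(fileSeparator, '\u0000') + "\u0000" + modifiedStamp;
    }

    public static void main(String[] args) {
        String id = uid("http://localhost/stringbeans/doc.pdf", '/', "20040101");
        System.out.println(id.indexOf('\u0000') >= 0);
    }
}
```

Keying the uid on both location and timestamp is what makes an incremental re-index possible: an unchanged document maps to the same uid, while a modified one shows up as a new term.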