Source Code Cross Referenced for AbstractExtractor.java in » Search-Engine » zilverline » org » zilverline » extractors » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » zilverline » org.zilverline.extractors 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /*
002:         * Copyright 2003-2004 Michael Franken, Zilverline.
003:         *
004:         * The contents of this file, or the files included with this file, are subject to
005:         * the current version of ZILVERLINE Collaborative Source License for the
006:         * Zilverline Search Engine (the "License"); You may not use this file except in
007:         * compliance with the License.
008:         *
009:         * You may obtain a copy of the License at
010:         *
011:         *     http://www.zilverline.org.
012:         *
013:         * See the License for the rights, obligations and
014:         * limitations governing use of the contents of the file.
015:         *
016:         * The Original and Upgraded Code is the Zilverline Search Engine. The developer of
017:         * the Original and Upgraded Code is Michael Franken. Michael Franken owns the
018:         * copyrights in the portions it created. All Rights Reserved.
019:         *
020:         */
021:
022:        package org.zilverline.extractors;
023:
024:        import java.io.File;
025:        import java.io.IOException;
026:        import java.io.Reader;
027:
028:        import org.apache.commons.logging.Log;
029:        import org.apache.commons.logging.LogFactory;
030:
031:        import org.springframework.util.StringUtils;
032:
033:        import org.zilverline.core.Extractor;
034:        import org.zilverline.core.ParsedFileInfo;
035:        import org.zilverline.util.FileUtils;
036:        import org.zilverline.util.Utils;
037:
038:        /**
039:         * Abstract baseclass of extractors. Extractors extract all relevant info from a File, and return the info in a ParsedFileInfo
040:         * Object.
041:         * 
042:         * @author Michael Franken
043:         * @version $Revision: 1.20 $
044:         * 
045:         * @see org.zilverline.core.ParsedFileInfo
046:         */
047:        public abstract class AbstractExtractor implements  Extractor {
048:            /** default size of summary extracted from the file. */
049:            private static final int SUMMARY_SIZE = 200;
050:
051:            /**
052:             * logger for Commons logging. This is non-static final protected, such that it defines a log for all subclasses too.
053:             */
054:            protected final Log log = LogFactory.getLog(getClass().getName());
055:
056:            private final static Log log2 = LogFactory
057:                    .getLog(AbstractExtractor.class);
058:
059:            /** default size of summary extracted from the file. */
060:            private ParsedFileInfo fileInfo = new ParsedFileInfo();
061:
062:            /**
063:             * Set the file and all file related information of the document, such as length and modification date.
064:             * 
065:             * @param f The file that is being parsed
066:             */
067:            public final void setFile(final File f) {
068:                fileInfo.setFile(f);
069:                fileInfo.setSize(f.length());
070:                fileInfo.setModificationDate(f.lastModified());
071:            }
072:
073:            /**
074:             * Set the type of the document.
075:             * 
076:             * @param type such as EXCEL, PDF
077:             */
078:            public final void setType(final String type) {
079:                fileInfo.setType(type);
080:            }
081:
082:            /**
083:             * Set the author of the document.
084:             * 
085:             * @param author the author
086:             */
087:            public final void setAuthor(final String author) {
088:                fileInfo.setAuthor(author);
089:            }
090:
091:            /**
092:             * Set the isbn number of the document.
093:             * 
094:             * @param ISBN the ISBN number
095:             */
096:            public final void setISBN(final String ISBN) {
097:                fileInfo.setISBN(ISBN);
098:            }
099:
100:            /**
101:             * Set the title of the document.
102:             * 
103:             * @param title the title
104:             */
105:            public final void setTitle(final String title) {
106:                fileInfo.setTitle(title);
107:            }
108:
109:            /**
110:             * Set the size of the document.
111:             * 
112:             * @param size the size in bytes
113:             */
114:            public final void setSize(final long size) {
115:                fileInfo.setSize(size);
116:            }
117:
118:            /**
119:             * Set the modificationDate of the document.
120:             * 
121:             * @param modificationDate the modificationDate in milliseconds since January 1, 1970, 00:00:00 GMT
122:             */
123:            public final void setModificationDate(final long modificationDate) {
124:                fileInfo.setModificationDate(modificationDate);
125:            }
126:
127:            /**
128:             * Set the creationDate of the document.
129:             * 
130:             * @param creationDate the creationDate in milliseconds since January 1, 1970, 00:00:00 GMT
131:             */
132:            public final void setCreationDate(final long creationDate) {
133:                fileInfo.setCreationDate(creationDate);
134:            }
135:
136:            /**
137:             * Set the summary of the document.
138:             * 
139:             * @param summary the summary
140:             */
141:            public final void setSummary(final String summary) {
142:                fileInfo.setSummary(summary);
143:            }
144:
145:            /**
146:             * Extract the content from the given file. As a side effect other attributes of ParsedFileInfo may be set too.
147:             * 
148:             * Implementations should catch all checked exceptions, sensibly, And close all resources.
149:             * 
150:             * @param f The file to extract the content from.
151:             * 
152:             * @return Reader containing text-only content
153:             */
154:            public abstract Reader getContent(final File f);
155:
156:            /**
157:             * This method extracts all relevant info of the file as an ParsedFileInfo object. Uses getContent as callback.
158:             * 
159:             * @param f the File to extract content from
160:             * 
161:             * @return ParsedFileInfo the object containing relevant info of the provided file
162:             */
163:            public final ParsedFileInfo extractInfo(final File f) {
164:                if (f == null) {
165:                    log
166:                            .warn("Something went terribly wrong, file = null, returning null ");
167:                    return null;
168:                }
169:                try {
170:                    setFile(f);
171:
172:                    Reader reader = getContent(f);
173:                    fileInfo.setReader(reader);
174:                    // get the summary from the reader
175:                    if (reader != null) {
176:                        String summary = fileInfo.getSummary();
177:
178:                        if (!StringUtils.hasText(summary)) {
179:                            char[] sumChars = new char[SUMMARY_SIZE];
180:                            int numChars = 0;
181:                            try {
182:                                if (reader.markSupported()) {
183:                                    reader.mark(SUMMARY_SIZE);
184:                                    numChars = reader.read(sumChars);
185:                                    reader.reset();
186:                                }
187:                                if (numChars > 0) {
188:                                    summary = new String(sumChars, 0, numChars);
189:                                }
190:                                if (log.isDebugEnabled()) {
191:                                    log.debug("Summary extracted from reader: "
192:                                            + summary);
193:                                }
194:                                setSummary(getSummaryFromContent(summary));
195:                            } catch (IOException e) {
196:                                log.warn(
197:                                        "Error extracting summary form reader",
198:                                        e);
199:                            }
200:                        }
201:                    }
202:                    // Set the title if there's none yet
203:                    if (!StringUtils.hasLength(fileInfo.getTitle())) {
204:                        fileInfo.setTitle(FileUtils.getBasename(f));
205:                    }
206:                } catch (Exception e) {
207:                    // here we don't throw any, since we do not want to interrupt the indexing process
208:                    log.warn("Unexpected Error extracting content from  "
209:                            + f.getName(), e);
210:                } catch (OutOfMemoryError e) {
211:                    // this happens with very, very large Documents
212:                    log
213:                            .error(
214:                                    "Very Serious Error. Out of Memory for very large documents: "
215:                                            + f.getName()
216:                                            + ", try increasing your JVM heap  size: for example, start your server with option '-Xmx128m'."
217:                                            + " Skipping file.", e);
218:                } catch (Throwable e) {
219:                    log.error(
220:                            "Very Serious Error while extracting contents from: "
221:                                    + f.getName(), e);
222:                }
223:
224:                return fileInfo;
225:            }
226:
227:            /**
228:             * Get a ISBN number from the given text.
229:             * 
230:             * @param text the plain text, can be null
231:             * @return a valid ISBNnumber (10 characters without -) or else ""
232:             */
233:            public static String getISBNFromContent(final String text) {
234:                if (text == null) {
235:                    return "";
236:                }
237:                // ISBN:0764543857
238:                String ISBNnumber = "";
239:                int j;
240:                // does text contain ISBN or isbn?
241:                if (((j = text.indexOf("ISBN")) != -1)
242:                        || (j = text.indexOf("isbn")) != -1) {
243:                    // look 25 characters forward
244:                    ISBNnumber = text.substring(j, j + 25);
245:                    // remove ISBN.. (all text until first number)
246:                    ISBNnumber = ISBNnumber.replaceFirst("[\\D]+", "");
247:                    // remove all non-valid ISBN characters (0-9xX and - seem valid), remove - as well
248:                    ISBNnumber = ISBNnumber.replaceAll("[^0-9xX]", "");
249:                    if (ISBNnumber.length() > 10) {
250:                        ISBNnumber = ISBNnumber.substring(0, 10);
251:                    }
252:                    log2.debug("possible ISBN found: " + ISBNnumber);
253:                    if (!Utils.isValidISBNNumber(ISBNnumber)) {
254:                        return "";
255:                    }
256:                }
257:                return ISBNnumber;
258:            }
259:
260:            /**
261:             * Get a summary from the given text.
262:             * 
263:             * @param text the plain text, can be null
264:             * @return the summary
265:             */
266:            public static String getSummaryFromContent(final String text) {
267:                if (!StringUtils.hasText(text)) {
268:                    return "";
269:                }
270:                // alternative: just the first characters:
271:                String summary = text.substring(0, Math.min(text.length(),
272:                        SUMMARY_SIZE));
273:                // SimpleSummariser sum = new SimpleSummariser();
274:                // get two representative lines
275:                // String summary = sum.summarise(text, 2);
276:                // return with minimal whitespace
277:                return summary.replaceAll("\\s+", " ");
278:            }
279:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.