Source Code Cross Referenced for HttpDocToFile.java in » Web-Crawler » JoBo » net » matuschek » http » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Web Crawler » JoBo » net.matuschek.http

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        package net.matuschek.http;
002:
003:        /************************************************
004:         Copyright (c) 2001/2002 by Daniel Matuschek
005:         *************************************************/
006:
007:        import java.io.BufferedOutputStream;
008:        import java.io.File;
009:        import java.io.FileInputStream;
010:        import java.io.FileOutputStream;
011:        import java.io.IOException;
012:        import java.net.URL;
013:        import java.util.StringTokenizer;
014:
015:        import org.apache.log4j.Category;
016:
017:        /**
018:         * DocumentManager that will store document contents in a file.
019:         *
020:         * @author Daniel Matuschek 
021:         * @version $Revision: 1.11 $
022:         */
023:        public class HttpDocToFile extends AbstractHttpDocManager {
024:            /**
025:             * directory where the files will be created
026:             */
027:            private String baseDir;
028:
029:            /**
030:             * the object will not store files smaller then this size !
031:             */
032:            private int minFileSize;
033:
034:            /**
035:             * defines if special characters in the URL should be replaced
036:             * by "normal" characters
037:             * @see #setReplaceAllSpecials(boolean)
038:             */
039:            private boolean replaceAllSpecials = false;
040:
041:            /**
042:             * defines, if CGIs should be stored on disc. 
043:             *
044:             * @see #setStoreCGI
045:             */
046:            private boolean storeCGI = true;
047:
048:            /** Log4J logging */
049:            private Category log;
050:
051:            /**
052:             * creates a new HttpDocToFile object that will store the
053:             * documents in the given directory
054:             */
055:            public HttpDocToFile(String baseDir) {
056:                this .baseDir = baseDir;
057:                log = Category.getInstance(getClass().getName());
058:            }
059:
060:            /**
061:             * store document (that means write it to disk)
062:             * @param doc the document to store
063:             * @exception DocManagerException if the document can't be stored
064:             * (some IO error occured)
065:             */
066:            public void storeDocument(HttpDoc doc) throws DocManagerException {
067:                if ((doc == null) || (doc.getContent() == null)) {
068:                    return;
069:                }
070:
071:                /* 
072:                 * write file only, if this was NOT a cached document
073:                 * (in this case we have it already on harddisk)
074:                 */
075:                if (doc.isCached()) {
076:                    return;
077:                }
078:
079:                if ((!storeCGI) && (doc.getURL().toString().indexOf('?') >= 0)) {
080:                    // do not store dynamic pages, because storeCGI is false
081:                    // and the URL contains a "?"
082:                    return;
083:                }
084:
085:                String filename = url2Filename(doc.getURL());
086:                if (doc.getContent().length >= minFileSize) {
087:                    try {
088:                        createDirs(filename);
089:                        BufferedOutputStream os = new BufferedOutputStream(
090:                                new FileOutputStream(filename));
091:                        os.write(doc.getContent());
092:                        os.flush();
093:                        os.close();
094:                    } catch (IOException e) {
095:                        throw new DocManagerException(e.getMessage());
096:                    }
097:                }
098:            }
099:
100:            /**
101:             * Gets the cacheFile of the given URL if its document was stored.
102:             * @param url
103:             * @return cacheFile
104:             */
105:            protected File getCacheFile(URL url) {
106:                // does the file exists on the filesystem ?
107:                File cacheFile = new File(url2Filename(url));
108:                if (!(cacheFile.exists() && (cacheFile.isFile()))) {
109:                    return null;
110:                }
111:                return cacheFile;
112:            }
113:
114:            /**
115:             * Gets the extension of the given URL if its document was stored.
116:             * @param url
117:             * @return String
118:             */
119:            protected String getExtension(URL url) {
120:                // is it dynamic ?
121:                if ((url.toString().indexOf('?') >= 0)
122:                        || (url.toString().indexOf("cgi") >= 0)) {
123:                    return null;
124:                }
125:
126:                // do we have an filename extension ?
127:                // without it is not possible to guess the MIME type.
128:                String path = url.getPath();
129:                String ext = null;
130:
131:                if (path.indexOf(".") < 0) {
132:                    return null;
133:                }
134:
135:                StringTokenizer st = new StringTokenizer(path, ".");
136:                while (st.hasMoreTokens()) {
137:                    ext = st.nextToken();
138:                }
139:                // no extension if ext contains a "/"
140:                if (ext.indexOf("/") >= 0) {
141:                    return null;
142:                }
143:
144:                return ext;
145:            }
146:
147:            /**
148:             * Removes a document that was stored previous from the file system. Because
149:             * the HttpDocToFile does not store the HTTP headers, only the Content-Type
150:             * header will exists. Even this header may not be correct. It will only use a
151:             * simple heuristic to determine the possible MIME type.
152:             */
153:            public void removeDocument(URL u) {
154:                String ext = getExtension(u);
155:                if (ext == null)
156:                    return;
157:                File cacheFile = getCacheFile(u);
158:                if (cacheFile == null)
159:                    return;
160:
161:                cacheFile.delete();
162:            }
163:
164:            /**
165:             * Gets a document that was stored previous from the file system.
166:             * Because the HttpDocToFile does not store the HTTP headers, only
167:             * the Content-Type header will exists. Even this header may not 
168:             * be correct. It will only use a simple heuristic to determine the
169:             * possible MIME type.
170:             *
171:             * @return null, if this document was not stored before or it seems
172:             * to be a dynamic document.
173:             */
174:            public HttpDoc retrieveFromCache(URL u) {
175:                String ext = getExtension(u);
176:                if (ext == null)
177:                    return null;
178:                File cacheFile = getCacheFile(u);
179:                if (cacheFile == null)
180:                    return null;
181:
182:                // create a buffer;
183:                long size = cacheFile.length();
184:                if (size > Integer.MAX_VALUE) {
185:                    log.info("File too large");
186:                    return null;
187:                }
188:
189:                byte[] buff = new byte[(int) size];
190:
191:                // read the file
192:                try {
193:                    FileInputStream fi = new FileInputStream(cacheFile);
194:                    fi.read(buff);
195:                } catch (IOException e) {
196:                    log
197:                            .info("Could not read cached document "
198:                                    + e.getMessage());
199:                    return null;
200:                }
201:
202:                // create a new HttpDoc object
203:                HttpDoc doc = new HttpDoc();
204:
205:                // and set the content and the header
206:                doc.setHttpCode("HTTP/1.0 200 OK");
207:                doc.setContent(buff);
208:
209:                // now guess the MIME type
210:                String mimetype = null;
211:
212:                if (ext.equals("html") || ext.equals("htm")
213:                        || ext.equals("shtml") || ext.equals("asp")
214:                        || ext.equals("php") || ext.equals("jsp")) {
215:                    mimetype = "text/html";
216:                } else {
217:                    mimetype = "application/unknown";
218:                }
219:
220:                doc.addHeader(new HttpHeader("Content-Type", mimetype));
221:                doc.setURL(u);
222:                doc.setCached(true);
223:
224:                return doc;
225:            }
226:
227:            /**
228:             * gets the value of baseDir
229:             * @return the value of baseDir
230:             */
231:            public String getBaseDir() {
232:                return baseDir;
233:            }
234:
235:            /**
236:             * sets the value of basedir
237:             * @param baseDir the new value of baseDir
238:             */
239:            public void setBaseDir(String baseDir) {
240:                this .baseDir = baseDir;
241:            }
242:
243:            /**
244:             * converts an URL to a filename http://host/path will 
245:             * be converted to basedir/host/path
246:             * @param URL a URL to convert, must not be null
247:             * @return a pathname
248:             */
249:            protected String url2Filename(URL u) {
250:                StringBuffer sb = new StringBuffer();
251:
252:                sb.append(baseDir);
253:                sb.append(File.separatorChar);
254:                sb.append(u.getHost());
255:                sb.append(u.getFile());
256:
257:                // is there a query part ?
258:                // that is something after the file name seperated by ?
259:                String query = u.getQuery();
260:                if ((query != null) && (!query.equals(""))) {
261:                    sb.append(File.separatorChar);
262:                    sb.append(query);
263:                }
264:
265:                // filename that ends with /
266:                // are directories, we will name the file "index.html"
267:                if (sb.charAt(sb.length() - 1) == '/') {
268:                    sb.append("index.html");
269:                }
270:
271:                // postprocess filename (replace special characters)
272:                for (int i = 0; i < sb.length(); i++) {
273:                    char c = sb.charAt(i);
274:                    char newc = (char) 0;
275:
276:                    // replace / by operating system file name separator
277:                    if (c == '/') {
278:                        newc = File.separatorChar;
279:                    }
280:
281:                    // replace special characters from CGIs
282:                    if (replaceAllSpecials) {
283:                        if ((c == '?') || (c == '=') || (c == '&')) {
284:                            newc = '-';
285:                        }
286:                    }
287:
288:                    if ((newc != (char) 0) && (newc != c)) {
289:                        sb.setCharAt(i, newc);
290:                    }
291:                }
292:
293:                return sb.toString();
294:            }
295:
296:            /** 
297:             * creates all directories that are needed to place the 
298:             * file filename if they don't exists 
299:             * @param filename the full path name of a file
300:             */
301:            protected void createDirs(String filename) throws IOException {
302:                int pos = -1;
303:                // look for the last directory separator in the filename
304:                for (int i = filename.length() - 1; i >= 0; i--) {
305:                    if (filename.charAt(i) == File.separatorChar) {
306:                        pos = i;
307:                        i = -1;
308:                    }
309:                }
310:                File dir = new File(filename.substring(0, pos));
311:                dir.mkdirs();
312:            }
313:
314:            /**
315:             * gets the value of minFileSize. Files smaller then this size
316:             * (in Bytes) will not be saved to disk !
317:             * @return the value of minFileSize 
318:             */
319:            public int getMinFileSize() {
320:                return minFileSize;
321:            }
322:
323:            /**
324:             * sets the value of minFileSize
325:             * @param minFileSize the new value of minFileSize
326:             * @see #getMinFileSize()
327:             */
328:            public void setMinFileSize(int minFileSize) {
329:                this .minFileSize = minFileSize;
330:            }
331:
332:            /**
333:             * Get the value of replaceAllSpecials.
334:             *
335:             * if replaceAllSpecials is true, all sepcial characters in the URL
336:             * will be replaced by "-". This is useful for operating system that
337:             * can't handle files with special characters in the filename (e.g.
338:             * Windows)
339:             *
340:             * @return value of replaceAllSpecials.
341:             */
342:            public boolean isReplaceAllSpecials() {
343:                return replaceAllSpecials;
344:            }
345:
346:            /**
347:             * Set the value of replaceAllSpecials.
348:             *
349:             * if replaceAllSpecials is true, all sepcial characters in the URL
350:             * will be replaced by "-". This is useful for operating system that
351:             * can't handle files with special characters in the filename (e.g.
352:             * Windows)
353:             *
354:             * @param v  Value to assign to replaceAllSpecials.
355:             */
356:            public void setReplaceAllSpecials(boolean v) {
357:                this .replaceAllSpecials = v;
358:            }
359:
360:            /**
361:             * Get the value of storeCGI
362:             *
363:             * If this is true, the object will store ALL retrieved documents,
364:             * otherwise it will store only documents from URLs that do not
365:             * have a "?" in the URL
366:             */
367:            public boolean getStoreCGI() {
368:                return storeCGI;
369:            }
370:
371:            /**
372:             * Set the value of storeCGI.
373:             *
374:             * If this is true, the object will store ALL retrieved documents,
375:             * otherwise it will store only documents from URLs that do not
376:             * have a "?" in the URL
377:             *
378:             * @param v  Value to assign to storeCGI.
379:             */
380:            public void setStoreCGI(boolean v) {
381:                this.storeCGI = v;
382:            }
383:
384:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.