Source Code Cross Referenced for ExperimentalWARCWriter.java in  » Web-Crawler » heritrix » org » archive » io » warc » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.io.warc 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /*  $Id: ExperimentalWARCWriter.java 4604 2006-09-06 05:38:18Z stack-sf $
002:         *
003:         * Created on July 27th, 2006
004:         *
005:         * Copyright (C) 2006 Internet Archive.
006:         *
007:         * This file is part of the Heritrix web crawler (crawler.archive.org).
008:         *
009:         * Heritrix is free software; you can redistribute it and/or modify
010:         * it under the terms of the GNU Lesser Public License as published by
011:         * the Free Software Foundation; either version 2.1 of the License, or
012:         * any later version.
013:         *
014:         * Heritrix is distributed in the hope that it will be useful,
015:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
016:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017:         * GNU Lesser Public License for more details.
018:         *
019:         * You should have received a copy of the GNU Lesser Public License
020:         * along with Heritrix; if not, write to the Free Software
021:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022:         */
023:        package org.archive.io.warc;
024:
025:        import java.io.ByteArrayInputStream;
026:        import java.io.ByteArrayOutputStream;
027:        import java.io.File;
028:        import java.io.IOException;
029:        import java.io.InputStream;
030:        import java.io.OutputStream;
031:        import java.net.URI;
032:        import java.net.URISyntaxException;
033:        import java.util.Iterator;
034:        import java.util.List;
035:        import java.util.Map;
036:        import java.util.concurrent.atomic.AtomicInteger;
037:
038:        import org.archive.io.UTF8Bytes;
039:        import org.archive.io.WriterPoolMember;
040:        import org.archive.uid.GeneratorFactory;
041:        import org.archive.util.ArchiveUtils;
042:        import org.archive.util.anvl.ANVLRecord;
043:
044:        /**
045:         * <b>Experimental</b> WARC implementation.
046:         *
047:         * <p>Assumption is that the caller is managing access to this
048:         * ExperimentalWARCWriter ensuring only one thread accessing this WARC instance
049:         * at any one time.
050:         * 
051:         * <p>While being written, WARCs have a '.open' suffix appended.
052:         *
053:         * @author stack
054:         * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
055:         */
056:        public class ExperimentalWARCWriter extends WriterPoolMember implements 
057:                WARCConstants {
058:            /**
059:             * Buffer to reuse writing streams.
060:             */
061:            private final byte[] readbuffer = new byte[16 * 1024];
062:
063:            /**
064:             * NEWLINE as bytes.
065:             */
066:            public static byte[] CRLF_BYTES;
067:            static {
068:                try {
069:                    CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);
070:                } catch (Exception e) {
071:                    e.printStackTrace();
072:                }
073:            };
074:
075:            /**
076:             * Metadata.
077:             * TODO: Exploit writing warcinfo record.  Currently unused.
078:             */
079:            private final List fileMetadata;
080:
081:            /**
082:             * Shutdown Constructor
083:             * Has default access so can make instance to test utility methods.
084:             */
085:            ExperimentalWARCWriter() {
086:                this (null, null, "", "", true, -1, null);
087:            }
088:
089:            /**
090:             * Constructor.
091:             * Takes a stream. Use with caution. There is no upperbound check on size.
092:             * Will just keep writing.  Only pass Streams that are bounded. 
093:             * @param serialNo  used to generate unique file name sequences
094:             * @param out Where to write.
095:             * @param f File the <code>out</code> is connected to.
096:             * @param cmprs Compress the content written.
097:             * @param a14DigitDate If null, we'll write current time.
098:             * @throws IOException
099:             */
100:            public ExperimentalWARCWriter(final AtomicInteger serialNo,
101:                    final OutputStream out, final File f, final boolean cmprs,
102:                    final String a14DigitDate, final List warcinfoData)
103:                    throws IOException {
104:                super (serialNo, out, f, cmprs, a14DigitDate);
105:                this .fileMetadata = warcinfoData;
106:            }
107:
108:            /**
109:             * Constructor.
110:             *
111:             * @param dirs Where to drop files.
112:             * @param prefix File prefix to use.
113:             * @param cmprs Compress the records written. 
114:             * @param maxSize Maximum size for ARC files written.
115:             * @param suffix File tail to use.  If null, unused.
116:             * @param warcinfoData File metadata for warcinfo record.
117:             */
118:            public ExperimentalWARCWriter(final AtomicInteger serialNo,
119:                    final List<File> dirs, final String prefix,
120:                    final String suffix, final boolean cmprs,
121:                    final long maxSize, final List warcinfoData) {
122:                super (serialNo, dirs, prefix, suffix, cmprs, maxSize,
123:                        WARC_FILE_EXTENSION);
124:                this .fileMetadata = warcinfoData;
125:            }
126:
127:            @Override
128:            protected String createFile(File file) throws IOException {
129:                String filename = super .createFile(file);
130:                writeWarcinfoRecord(filename);
131:                return filename;
132:            }
133:
134:            protected void baseCharacterCheck(final char c,
135:                    final String parameter) throws IOException {
136:                // TODO: Too strict?  UNICODE control characters?
137:                if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {
138:                    throw new IOException("Contains illegal character 0x"
139:                            + Integer.toHexString(c) + ": " + parameter);
140:                }
141:            }
142:
143:            protected String checkHeaderValue(final String value)
144:                    throws IOException {
145:                for (int i = 0; i < value.length(); i++) {
146:                    final char c = value.charAt(i);
147:                    baseCharacterCheck(c, value);
148:                    if (Character.isWhitespace(c)) {
149:                        throw new IOException(
150:                                "Contains disallowed white space 0x"
151:                                        + Integer.toHexString(c) + ": " + value);
152:                    }
153:                }
154:                return value;
155:            }
156:
157:            protected String checkHeaderLineMimetypeParameter(
158:                    final String parameter) throws IOException {
159:                StringBuilder sb = new StringBuilder(parameter.length());
160:                boolean wasWhitespace = false;
161:                for (int i = 0; i < parameter.length(); i++) {
162:                    char c = parameter.charAt(i);
163:                    if (Character.isWhitespace(c)) {
164:                        // Map all to ' ' and collapse multiples into one.
165:                        // TODO: Make sure white space occurs in legal location --
166:                        // before parameter or inside quoted-string.
167:                        if (wasWhitespace) {
168:                            continue;
169:                        }
170:                        wasWhitespace = true;
171:                        c = ' ';
172:                    } else {
173:                        wasWhitespace = false;
174:                        baseCharacterCheck(c, parameter);
175:                    }
176:                    sb.append(c);
177:                }
178:
179:                return sb.toString();
180:            }
181:
182:            protected String createRecordHeader(final String type,
183:                    final String url, final String create14DigitDate,
184:                    final String mimetype, final URI recordId,
185:                    final ANVLRecord xtraHeaders, final long contentLength)
186:                    throws IOException {
187:                final StringBuilder sb = new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/);
188:                sb.append(WARC_ID).append(CRLF);
189:                sb.append(HEADER_KEY_TYPE).append(COLON_SPACE).append(type)
190:                        .append(CRLF);
191:                // Do not write a subject-uri if not one present.
192:                if (url != null && url.length() > 0) {
193:                    sb.append(HEADER_KEY_URI).append(COLON_SPACE).append(
194:                            checkHeaderValue(url)).append(CRLF);
195:                }
196:                sb.append(HEADER_KEY_DATE).append(COLON_SPACE).append(
197:                        create14DigitDate).append(CRLF);
198:                if (xtraHeaders != null) {
199:                    for (final Iterator i = xtraHeaders.iterator(); i.hasNext();) {
200:                        sb.append(i.next()).append(CRLF);
201:                    }
202:                }
203:
204:                // TODO: Is MIME Version needed.
205:                sb.append(MIME_VERSION).append(CRLF);
206:                sb.append(CONTENT_ID).append(COLON_SPACE).append('<').append(
207:                        recordId.toString()).append('>').append(CRLF);
208:                if (contentLength > 0) {
209:                    sb.append(CONTENT_TYPE).append(COLON_SPACE).append(
210:                            checkHeaderLineMimetypeParameter(mimetype)).append(
211:                            CRLF);
212:                }
213:                sb.append(CONTENT_LENGTH).append(COLON_SPACE).append(
214:                        Long.toString(contentLength)).append(CRLF);
215:
216:                return sb.toString();
217:            }
218:
219:            protected void writeRecord(final String type, final String url,
220:                    final String create14DigitDate, final String mimetype,
221:                    final URI recordId, ANVLRecord xtraHeaders,
222:                    final InputStream contentStream, final long contentLength)
223:                    throws IOException {
224:                if (!TYPES_LIST.contains(type)) {
225:                    throw new IllegalArgumentException("Unknown record type: "
226:                            + type);
227:                }
228:                if (contentLength == 0
229:                        && (xtraHeaders == null || xtraHeaders.size() <= 0)) {
230:                    throw new IllegalArgumentException("Cannot write record "
231:                            + "of content-length zero and base headers only.");
232:                }
233:
234:                preWriteRecordTasks();
235:                try {
236:                    final String header = createRecordHeader(type, url,
237:                            create14DigitDate, mimetype, recordId, xtraHeaders,
238:                            contentLength);
239:                    // TODO: Revisit endcoding of header.
240:                    write(header.getBytes(WARC_HEADER_ENCODING));
241:
242:                    if (contentStream != null && contentLength > 0) {
243:                        // Write out the header/body separator.
244:                        write(CRLF_BYTES); // TODO: should this be written even for zero-length?
245:                        readToLimitFrom(contentStream, contentLength,
246:                                this .readbuffer);
247:                    }
248:
249:                    // Write out the two blank lines at end of all records.
250:                    // TODO: Why? Messes up skipping through file. Also not in grammar.
251:                    write(CRLF_BYTES);
252:                    write(CRLF_BYTES);
253:                } finally {
254:                    postWriteRecordTasks();
255:                }
256:            }
257:
258:            protected URI generateRecordId(final Map<String, String> qualifiers)
259:                    throws IOException {
260:                URI rid = null;
261:                try {
262:                    rid = GeneratorFactory.getFactory().getQualifiedRecordID(
263:                            qualifiers);
264:                } catch (URISyntaxException e) {
265:                    // Convert to IOE so can let it out.
266:                    throw new IOException(e.getMessage());
267:                }
268:                return rid;
269:            }
270:
271:            protected URI generateRecordId(final String key, final String value)
272:                    throws IOException {
273:                URI rid = null;
274:                try {
275:                    rid = GeneratorFactory.getFactory().getQualifiedRecordID(
276:                            key, value);
277:                } catch (URISyntaxException e) {
278:                    // Convert to IOE so can let it out.
279:                    throw new IOException(e.getMessage());
280:                }
281:                return rid;
282:            }
283:
284:            public URI writeWarcinfoRecord(String filename) throws IOException {
285:                return writeWarcinfoRecord(filename, null);
286:            }
287:
288:            public URI writeWarcinfoRecord(String filename,
289:                    final String description) throws IOException {
290:                // Strip .open suffix if present.
291:                if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
292:                    filename = filename.substring(0, filename.length()
293:                            - WriterPoolMember.OCCUPIED_SUFFIX.length());
294:                }
295:                ANVLRecord record = new ANVLRecord(2);
296:                record.addLabelValue(HEADER_KEY_FILENAME, filename);
297:                if (description != null && description.length() > 0) {
298:                    record.addLabelValue(CONTENT_DESCRIPTION, description);
299:                }
300:                // Add warcinfo body.
301:                byte[] warcinfoBody = null;
302:                if (this .fileMetadata == null) {
303:                    // TODO: What to write into a warcinfo?  What to associate?
304:                    warcinfoBody = "TODO: Unimplemented".getBytes();
305:                } else {
306:                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
307:                    for (final Iterator i = this .fileMetadata.iterator(); i
308:                            .hasNext();) {
309:                        baos
310:                                .write(i.next().toString().getBytes(
311:                                        UTF8Bytes.UTF8));
312:                    }
313:                    warcinfoBody = baos.toByteArray();
314:                }
315:                URI uri = writeWarcinfoRecord("text/xml", record,
316:                        new ByteArrayInputStream(warcinfoBody),
317:                        warcinfoBody.length);
318:                // TODO: If at start of file, and we're writing compressed,
319:                // write out our distinctive GZIP extensions.
320:                return uri;
321:            }
322:
323:            /**
324:             * Write a warcinfo to current file.
325:             * TODO: Write crawl metadata or pointers to crawl description.
326:             * @param mimetype Mimetype of the <code>fileMetadata</code> block.
327:             * @param namedFields Named fields. Pass <code>null</code> if none.
328:             * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
329:             * @param fileMetadataLength Length of <code>fileMetadata</code>.
330:             * @throws IOException
331:             * @return Generated record-id made with
332:             * <a href="http://en.wikipedia.org/wiki/Data:_URL">data: scheme</a> and
333:             * the current filename.
334:             */
335:            public URI writeWarcinfoRecord(final String mimetype,
336:                    final ANVLRecord namedFields,
337:                    final InputStream fileMetadata,
338:                    final long fileMetadataLength) throws IOException {
339:                final URI recordid = generateRecordId(TYPE, WARCINFO);
340:                writeWarcinfoRecord(ArchiveUtils.getLog14Date(), mimetype,
341:                        recordid, namedFields, fileMetadata, fileMetadataLength);
342:                return recordid;
343:            }
344:
345:            /**
346:             * Write a <code>warcinfo</code> to current file.
347:             * The <code>warcinfo</code> type uses its <code>recordId</code> as its URL.
348:             * @param recordId URI to use for this warcinfo.
349:             * @param create14DigitDate Record creation date as 14 digit date.
350:             * @param mimetype Mimetype of the <code>fileMetadata</code>.
351:             * @param namedFields Named fields.
352:             * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
353:             * @param fileMetadataLength Length of <code>fileMetadata</code>.
354:             * @throws IOException
355:             */
356:            public void writeWarcinfoRecord(final String create14DigitDate,
357:                    final String mimetype, final URI recordId,
358:                    final ANVLRecord namedFields,
359:                    final InputStream fileMetadata,
360:                    final long fileMetadataLength) throws IOException {
361:                writeRecord(WARCINFO, null, create14DigitDate, mimetype,
362:                        recordId, namedFields, fileMetadata, fileMetadataLength);
363:            }
364:
365:            public void writeRequestRecord(final String url,
366:                    final String create14DigitDate, final String mimetype,
367:                    final URI recordId, final ANVLRecord namedFields,
368:                    final InputStream request, final long requestLength)
369:                    throws IOException {
370:                writeRecord(REQUEST, url, create14DigitDate, mimetype,
371:                        recordId, namedFields, request, requestLength);
372:            }
373:
374:            public void writeResourceRecord(final String url,
375:                    final String create14DigitDate, final String mimetype,
376:                    final ANVLRecord namedFields, final InputStream response,
377:                    final long responseLength) throws IOException {
378:                writeResourceRecord(url, create14DigitDate, mimetype,
379:                        getRecordID(), namedFields, response, responseLength);
380:            }
381:
382:            public void writeResourceRecord(final String url,
383:                    final String create14DigitDate, final String mimetype,
384:                    final URI recordId, final ANVLRecord namedFields,
385:                    final InputStream response, final long responseLength)
386:                    throws IOException {
387:                writeRecord(RESOURCE, url, create14DigitDate, mimetype,
388:                        recordId, namedFields, response, responseLength);
389:            }
390:
391:            public void writeResponseRecord(final String url,
392:                    final String create14DigitDate, final String mimetype,
393:                    final URI recordId, final ANVLRecord namedFields,
394:                    final InputStream response, final long responseLength)
395:                    throws IOException {
396:                writeRecord(RESPONSE, url, create14DigitDate, mimetype,
397:                        recordId, namedFields, response, responseLength);
398:            }
399:
400:            public void writeRevisitRecord(final String url,
401:                    final String create14DigitDate, final String mimetype,
402:                    final URI recordId, final ANVLRecord namedFields,
403:                    final InputStream response, final long responseLength)
404:                    throws IOException {
405:                writeRecord(REVISIT, url, create14DigitDate, mimetype,
406:                        recordId, namedFields, response, responseLength);
407:            }
408:
409:            public void writeMetadataRecord(final String url,
410:                    final String create14DigitDate, final String mimetype,
411:                    final URI recordId, final ANVLRecord namedFields,
412:                    final InputStream metadata, final long metadataLength)
413:                    throws IOException {
414:                writeRecord(METADATA, url, create14DigitDate, mimetype,
415:                        recordId, namedFields, metadata, metadataLength);
416:            }
417:
418:            /**
419:             * Convenience method for getting Record-Ids.
420:             * @return A record ID.
421:             * @throws IOException
422:             */
423:            public static URI getRecordID() throws IOException {
424:                URI result;
425:                try {
426:                    result = GeneratorFactory.getFactory().getRecordID();
427:                } catch (URISyntaxException e) {
428:                    throw new IOException(e.toString());
429:                }
430:                return result;
431:            }
432:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.