Source Code Cross Referenced for ExperimentalWARCWriter.java in  » Web-Crawler » heritrix » org » archive » io » warc » v10 » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.io.warc.v10 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /*  $Id: ExperimentalWARCWriter.java 4604 2006-09-06 05:38:18Z stack-sf $
002:         *
003:         * Created on July 27th, 2006
004:         *
005:         * Copyright (C) 2006 Internet Archive.
006:         *
007:         * This file is part of the Heritrix web crawler (crawler.archive.org).
008:         *
009:         * Heritrix is free software; you can redistribute it and/or modify
010:         * it under the terms of the GNU Lesser Public License as published by
011:         * the Free Software Foundation; either version 2.1 of the License, or
012:         * any later version.
013:         *
014:         * Heritrix is distributed in the hope that it will be useful,
015:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
016:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017:         * GNU Lesser Public License for more details.
018:         *
019:         * You should have received a copy of the GNU Lesser Public License
020:         * along with Heritrix; if not, write to the Free Software
021:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022:         */
023:        package org.archive.io.warc.v10;
024:
025:        import java.io.ByteArrayInputStream;
026:        import java.io.ByteArrayOutputStream;
027:        import java.io.File;
028:        import java.io.IOException;
029:        import java.io.InputStream;
030:        import java.io.OutputStream;
031:        import java.net.URI;
032:        import java.net.URISyntaxException;
033:        import java.text.DecimalFormat;
034:        import java.text.NumberFormat;
035:        import java.util.Iterator;
036:        import java.util.List;
037:        import java.util.Map;
038:        import java.util.concurrent.atomic.AtomicInteger;
039:
040:        import org.archive.io.UTF8Bytes;
041:        import org.archive.io.WriterPoolMember;
042:        import org.archive.io.warc.WARCConstants;
043:        import org.archive.uid.GeneratorFactory;
044:        import org.archive.util.ArchiveUtils;
045:        import org.archive.util.anvl.ANVLRecord;
046:
047:        /**
048:         * <b>Experimental</b> WARC implementation.
049:         * 
050:         * Based on unreleased version 0.9 of <a 
051:         * href="http://archive-access.sourceforge.net//warc/warc_file_format.html">WARC
052:         * File Format</a> document.  Specification and implementation subject to
053:         * change.
054:         *
055:         * <p>Assumption is that the caller is managing access to this
056:         * ExperimentalWARCWriter ensuring only one thread accessing this WARC instance
057:         * at any one time.
058:         * 
059:         * <p>While being written, WARCs have a '.open' suffix appended.
060:         *
061:         * @author stack
062:         * @version $Revision: 4604 $ $Date: 2006-09-05 22:38:18 -0700 (Tue, 05 Sep 2006) $
063:         */
064:        public class ExperimentalWARCWriter extends WriterPoolMember implements 
065:                WARCConstants {
066:            /**
067:             * Buffer to reuse writing streams.
068:             */
069:            private final byte[] readbuffer = new byte[16 * 1024];
070:
071:            /**
072:             * NEWLINE as bytes.
073:             */
074:            public static byte[] CRLF_BYTES;
075:            static {
076:                try {
077:                    CRLF_BYTES = CRLF.getBytes(DEFAULT_ENCODING);
078:                } catch (Exception e) {
079:                    e.printStackTrace();
080:                }
081:            };
082:
083:            /**
084:             * Formatter for the length.
085:             */
086:            private static NumberFormat RECORD_LENGTH_FORMATTER = new DecimalFormat(
087:                    PLACEHOLDER_RECORD_LENGTH_STRING);
088:
089:            /**
090:             * Metadata.
091:             * TODO: Exploit writing warcinfo record.  Currently unused.
092:             */
093:            private final List fileMetadata;
094:
095:            /**
096:             * Shutdown Constructor
097:             * Has default access so can make instance to test utility methods.
098:             */
099:            ExperimentalWARCWriter() {
100:                this (null, null, "", "", true, -1, null);
101:            }
102:
103:            /**
104:             * Constructor.
105:             * Takes a stream. Use with caution. There is no upperbound check on size.
106:             * Will just keep writing.  Only pass Streams that are bounded. 
107:             * @param serialNo  used to generate unique file name sequences
108:             * @param out Where to write.
109:             * @param f File the <code>out</code> is connected to.
110:             * @param cmprs Compress the content written.
111:             * @param a14DigitDate If null, we'll write current time.
112:             * @throws IOException
113:             */
114:            public ExperimentalWARCWriter(final AtomicInteger serialNo,
115:                    final OutputStream out, final File f, final boolean cmprs,
116:                    final String a14DigitDate, final List warcinfoData)
117:                    throws IOException {
118:                super (serialNo, out, f, cmprs, a14DigitDate);
119:                // TODO: Currently unused.
120:                this .fileMetadata = warcinfoData;
121:            }
122:
123:            /**
124:             * Constructor.
125:             *
126:             * @param dirs Where to drop files.
127:             * @param prefix File prefix to use.
128:             * @param cmprs Compress the records written. 
129:             * @param maxSize Maximum size for ARC files written.
130:             * @param suffix File tail to use.  If null, unused.
131:             * @param warcinfoData File metadata for warcinfo record.
132:             */
133:            public ExperimentalWARCWriter(final AtomicInteger serialNo,
134:                    final List<File> dirs, final String prefix,
135:                    final String suffix, final boolean cmprs,
136:                    final long maxSize, final List warcinfoData) {
137:                super (serialNo, dirs, prefix, suffix, cmprs, maxSize,
138:                        WARC_FILE_EXTENSION);
139:                // TODO: Currently unused.
140:                this .fileMetadata = warcinfoData;
141:            }
142:
143:            @Override
144:            protected String createFile(File file) throws IOException {
145:                String filename = super .createFile(file);
146:                writeWarcinfoRecord(filename);
147:                return filename;
148:            }
149:
150:            protected void baseCharacterCheck(final char c,
151:                    final String parameter) throws IOException {
152:                // TODO: Too strict?  UNICODE control characters?
153:                if (Character.isISOControl(c) || !Character.isValidCodePoint(c)) {
154:                    throw new IOException("Contains illegal character 0x"
155:                            + Integer.toHexString(c) + ": " + parameter);
156:                }
157:            }
158:
159:            protected String checkHeaderLineParameters(final String parameter)
160:                    throws IOException {
161:                for (int i = 0; i < parameter.length(); i++) {
162:                    final char c = parameter.charAt(i);
163:                    baseCharacterCheck(c, parameter);
164:                    if (Character.isWhitespace(c)) {
165:                        throw new IOException(
166:                                "Contains disallowed white space 0x"
167:                                        + Integer.toHexString(c) + ": "
168:                                        + parameter);
169:                    }
170:                }
171:                return parameter;
172:            }
173:
174:            protected String checkHeaderLineMimetypeParameter(
175:                    final String parameter) throws IOException {
176:                StringBuilder sb = new StringBuilder(parameter.length());
177:                boolean wasWhitespace = false;
178:                for (int i = 0; i < parameter.length(); i++) {
179:                    char c = parameter.charAt(i);
180:                    if (Character.isWhitespace(c)) {
181:                        // Map all to ' ' and collapse multiples into one.
182:                        // TODO: Make sure white space occurs in legal location --
183:                        // before parameter or inside quoted-string.
184:                        if (wasWhitespace) {
185:                            continue;
186:                        }
187:                        wasWhitespace = true;
188:                        c = ' ';
189:                    } else {
190:                        wasWhitespace = false;
191:                        baseCharacterCheck(c, parameter);
192:                    }
193:                    sb.append(c);
194:                }
195:
196:                return sb.toString();
197:            }
198:
199:            protected byte[] createRecordHeaderline(final String type,
200:                    final String url, final String create14DigitDate,
201:                    final String mimetype, final URI recordId,
202:                    final int namedFieldsLength, final long contentLength)
203:                    throws IOException {
204:                final StringBuilder sb = new StringBuilder(2048/*A SWAG: TODO: Do analysis.*/);
205:                sb.append(WARC_010_ID);
206:                sb.append(HEADER_FIELD_SEPARATOR);
207:                sb.append(PLACEHOLDER_RECORD_LENGTH_STRING);
208:                sb.append(HEADER_FIELD_SEPARATOR);
209:                sb.append(type);
210:                sb.append(HEADER_FIELD_SEPARATOR);
211:                sb.append(checkHeaderLineParameters(url));
212:                sb.append(HEADER_FIELD_SEPARATOR);
213:                sb.append(checkHeaderLineParameters(create14DigitDate));
214:                sb.append(HEADER_FIELD_SEPARATOR);
215:                // 0.9 of spec. has mimetype second-to-last and recordid last on
216:                // header line.  Here we swap their positions and allow writing
217:                // of full mimetypes rather than the curtailed type we used write into
218:                // ARCs.  These two deviations to be proposed as amendments to spec 0.9.
219:                sb.append(checkHeaderLineParameters(recordId.toString()));
220:                sb.append(HEADER_FIELD_SEPARATOR);
221:                sb.append(checkHeaderLineMimetypeParameter(mimetype));
222:                // Add terminating CRLF.
223:                sb.append(CRLF);
224:
225:                long length = sb.length() + namedFieldsLength + contentLength;
226:
227:                // Insert length and pad out to fixed width with zero prefix to
228:                // highlight 'fixed-widthness' of length.
229:                int start = WARC_010_ID.length() + 1 /*HEADER_FIELD_SEPARATOR */;
230:                int end = start + PLACEHOLDER_RECORD_LENGTH_STRING.length();
231:                String lenStr = RECORD_LENGTH_FORMATTER.format(length);
232:                sb.replace(start, end, lenStr);
233:
234:                return sb.toString().getBytes(HEADER_LINE_ENCODING);
235:            }
236:
237:            protected void writeRecord(final String type, final String url,
238:                    final String create14DigitDate, final String mimetype,
239:                    final URI recordId, ANVLRecord namedFields,
240:                    final InputStream contentStream, final long contentLength)
241:                    throws IOException {
242:                if (!TYPES_LIST.contains(type)) {
243:                    throw new IllegalArgumentException("Unknown record type: "
244:                            + type);
245:                }
246:                if (contentLength == 0
247:                        && (namedFields == null || namedFields.size() <= 0)) {
248:                    throw new IllegalArgumentException(
249:                            "Cannot have a record made "
250:                                    + "of a Header line only (Content and Named Fields are empty).");
251:                }
252:
253:                preWriteRecordTasks();
254:                try {
255:                    if (namedFields == null) {
256:                        // Use the empty anvl record so the length of blank line on
257:                        // end gets counted as part of the record length.
258:                        namedFields = ANVLRecord.EMPTY_ANVL_RECORD;
259:                    }
260:
261:                    // Serialize metadata first so we have metadata length.
262:                    final byte[] namedFieldsBlock = namedFields.getUTF8Bytes();
263:                    // Now serialize the Header line.
264:                    final byte[] header = createRecordHeaderline(type, url,
265:                            create14DigitDate, mimetype, recordId,
266:                            namedFieldsBlock.length, contentLength);
267:                    write(header);
268:                    write(namedFieldsBlock);
269:                    if (contentStream != null && contentLength > 0) {
270:                        readFullyFrom(contentStream, contentLength,
271:                                this .readbuffer);
272:                    }
273:
274:                    // Write out the two blank lines at end of all records.
275:                    // TODO: Why? Messes up skipping through file. Also not in grammar.
276:                    write(CRLF_BYTES);
277:                    write(CRLF_BYTES);
278:                } finally {
279:                    postWriteRecordTasks();
280:                }
281:            }
282:
283:            protected URI generateRecordId(final Map<String, String> qualifiers)
284:                    throws IOException {
285:                URI rid = null;
286:                try {
287:                    rid = GeneratorFactory.getFactory().getQualifiedRecordID(
288:                            qualifiers);
289:                } catch (URISyntaxException e) {
290:                    // Convert to IOE so can let it out.
291:                    throw new IOException(e.getMessage());
292:                }
293:                return rid;
294:            }
295:
296:            protected URI generateRecordId(final String key, final String value)
297:                    throws IOException {
298:                URI rid = null;
299:                try {
300:                    rid = GeneratorFactory.getFactory().getQualifiedRecordID(
301:                            key, value);
302:                } catch (URISyntaxException e) {
303:                    // Convert to IOE so can let it out.
304:                    throw new IOException(e.getMessage());
305:                }
306:                return rid;
307:            }
308:
309:            public URI writeWarcinfoRecord(String filename) throws IOException {
310:                return writeWarcinfoRecord(filename, null);
311:            }
312:
313:            public URI writeWarcinfoRecord(String filename,
314:                    final String description) throws IOException {
315:                // Strip .open suffix if present.
316:                if (filename.endsWith(WriterPoolMember.OCCUPIED_SUFFIX)) {
317:                    filename = filename.substring(0, filename.length()
318:                            - WriterPoolMember.OCCUPIED_SUFFIX.length());
319:                }
320:                ANVLRecord record = new ANVLRecord(2);
321:                record.addLabelValue(NAMED_FIELD_WARCFILENAME, filename);
322:                if (description != null && description.length() > 0) {
323:                    record.addLabelValue(NAMED_FIELD_DESCRIPTION, description);
324:                }
325:                // Add warcinfo body.
326:                byte[] warcinfoBody = null;
327:                if (this .fileMetadata == null) {
328:                    // TODO: What to write into a warcinfo?  What to associate?
329:                    warcinfoBody = "TODO: Unimplemented".getBytes();
330:                } else {
331:                    ByteArrayOutputStream baos = new ByteArrayOutputStream();
332:                    for (final Iterator i = this .fileMetadata.iterator(); i
333:                            .hasNext();) {
334:                        baos
335:                                .write(i.next().toString().getBytes(
336:                                        UTF8Bytes.UTF8));
337:                    }
338:                    warcinfoBody = baos.toByteArray();
339:                }
340:                URI uri = writeWarcinfoRecord("text/plain", record,
341:                        new ByteArrayInputStream(warcinfoBody),
342:                        warcinfoBody.length);
343:                // TODO: If at start of file, and we're writing compressed,
344:                // write out our distinctive GZIP extensions.
345:                return uri;
346:            }
347:
348:            /**
349:             * Write a warcinfo to current file.
350:             * TODO: Write crawl metadata or pointers to crawl description.
351:             * @param mimetype Mimetype of the <code>fileMetadata</code> block.
352:             * @param namedFields Named fields. Pass <code>null</code> if none.
353:             * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
354:             * @param fileMetadataLength Length of <code>fileMetadata</code>.
355:             * @throws IOException
356:             * @return Generated record-id made with
357:             * <a href="http://en.wikipedia.org/wiki/Data:_URL">data: scheme</a> and
358:             * the current filename.
359:             */
360:            public URI writeWarcinfoRecord(final String mimetype,
361:                    final ANVLRecord namedFields,
362:                    final InputStream fileMetadata,
363:                    final long fileMetadataLength) throws IOException {
364:                final URI recordid = generateRecordId(TYPE, WARCINFO);
365:                writeWarcinfoRecord(ArchiveUtils.get14DigitDate(), mimetype,
366:                        recordid, namedFields, fileMetadata, fileMetadataLength);
367:                return recordid;
368:            }
369:
370:            /**
371:             * Write a <code>warcinfo</code> to current file.
372:             * The <code>warcinfo</code> type uses its <code>recordId</code> as its URL.
373:             * @param recordId URI to use for this warcinfo.
374:             * @param create14DigitDate Record creation date as 14 digit date.
375:             * @param mimetype Mimetype of the <code>fileMetadata</code>.
376:             * @param namedFields Named fields.
377:             * @param fileMetadata Metadata about this WARC as RDF, ANVL, etc.
378:             * @param fileMetadataLength Length of <code>fileMetadata</code>.
379:             * @throws IOException
380:             */
381:            public void writeWarcinfoRecord(final String create14DigitDate,
382:                    final String mimetype, final URI recordId,
383:                    final ANVLRecord namedFields,
384:                    final InputStream fileMetadata,
385:                    final long fileMetadataLength) throws IOException {
386:                writeRecord(WARCINFO, recordId.toString(), create14DigitDate,
387:                        mimetype, recordId, namedFields, fileMetadata,
388:                        fileMetadataLength);
389:            }
390:
391:            public void writeRequestRecord(final String url,
392:                    final String create14DigitDate, final String mimetype,
393:                    final URI recordId, final ANVLRecord namedFields,
394:                    final InputStream request, final long requestLength)
395:                    throws IOException {
396:                writeRecord(REQUEST, url, create14DigitDate, mimetype,
397:                        recordId, namedFields, request, requestLength);
398:            }
399:
400:            public void writeResourceRecord(final String url,
401:                    final String create14DigitDate, final String mimetype,
402:                    final ANVLRecord namedFields, final InputStream response,
403:                    final long responseLength) throws IOException {
404:                writeResourceRecord(url, create14DigitDate, mimetype,
405:                        getRecordID(), namedFields, response, responseLength);
406:            }
407:
408:            public void writeResourceRecord(final String url,
409:                    final String create14DigitDate, final String mimetype,
410:                    final URI recordId, final ANVLRecord namedFields,
411:                    final InputStream response, final long responseLength)
412:                    throws IOException {
413:                writeRecord(RESOURCE, url, create14DigitDate, mimetype,
414:                        recordId, namedFields, response, responseLength);
415:            }
416:
417:            public void writeResponseRecord(final String url,
418:                    final String create14DigitDate, final String mimetype,
419:                    final URI recordId, final ANVLRecord namedFields,
420:                    final InputStream response, final long responseLength)
421:                    throws IOException {
422:                writeRecord(RESPONSE, url, create14DigitDate, mimetype,
423:                        recordId, namedFields, response, responseLength);
424:            }
425:
426:            public void writeMetadataRecord(final String url,
427:                    final String create14DigitDate, final String mimetype,
428:                    final URI recordId, final ANVLRecord namedFields,
429:                    final InputStream metadata, final long metadataLength)
430:                    throws IOException {
431:                writeRecord(METADATA, url, create14DigitDate, mimetype,
432:                        recordId, namedFields, metadata, metadataLength);
433:            }
434:
435:            /**
436:             * Convenience method for getting Record-Ids.
437:             * @return A record ID.
438:             * @throws IOException
439:             */
440:            public static URI getRecordID() throws IOException {
441:                URI result;
442:                try {
443:                    result = GeneratorFactory.getFactory().getRecordID();
444:                } catch (URISyntaxException e) {
445:                    throw new IOException(e.toString());
446:                }
447:                return result;
448:            }
449:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.