Source Code Cross Referenced for WARCRecord.java in » Web-Crawler » heritrix » org » archive » io » warc » v10 » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation

1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI

Java

Java Tutorial

Illustrator Tutorials

GIMP Tutorials

C# / C Sharp

C# / CSharp Tutorial

C# / CSharp Open Source

SQL Server / T-SQL Tutorial

Oracle PL / SQL

Oracle PL/SQL Tutorial

Flash / Flex / ActionScript

VBA / Excel / Access / Word

XML

XML Tutorial

Microsoft Office PowerPoint 2007 Tutorial

Microsoft Office Excel 2007 Tutorial

Microsoft Office Word 2007 Tutorial

Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.io.warc.v10

Source Cross Referenced Class Diagram Java Document (Java Doc)

001:        /* $Id: WARCRecord.java 4566 2006-08-31 16:51:41Z stack-sf $
002:         *
003:         * Created on August 25th, 2006
004:         *
005:         * Copyright (C) 2006 Internet Archive.
006:         *
007:         * This file is part of the Heritrix web crawler (crawler.archive.org).
008:         *
009:         * Heritrix is free software; you can redistribute it and/or modify
010:         * it under the terms of the GNU Lesser Public License as published by
011:         * the Free Software Foundation; either version 2.1 of the License, or
012:         * any later version.
013:         *
014:         * Heritrix is distributed in the hope that it will be useful,
015:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
016:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017:         * GNU Lesser Public License for more details.
018:         *
019:         * You should have received a copy of the GNU Lesser Public License
020:         * along with Heritrix; if not, write to the Free Software
021:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022:         */
023:        package org.archive.io.warc.v10;
024:
025:        import java.io.ByteArrayOutputStream;
026:        import java.io.IOException;
027:        import java.io.InputStream;
028:        import java.util.HashMap;
029:        import java.util.Map;
030:        import java.util.Set;
031:        import java.util.regex.Matcher;
032:        import java.util.regex.Pattern;
033:
034:        import org.archive.io.ArchiveRecord;
035:        import org.archive.io.ArchiveRecordHeader;
036:        import org.archive.io.warc.WARCConstants;
037:        import org.archive.util.LongWrapper;
038:        import org.archive.util.anvl.ANVLRecord;
039:
040:        /**
041:         * A WARC file Record.
042:         *
043:         * @author stack
044:         */
045:        public class WARCRecord extends ArchiveRecord implements  WARCConstants {
046:            /**
047:             * Header-Line pattern;
048:             * I heart http://www.fileformat.info/tool/regex.htm
049:             */
050:            private final static Pattern HEADER_LINE = Pattern
051:                    .compile("^WARC/([0-9]+\\.[0-9]+(?:\\.[0-9]+)?)"
052:                            + // Regex group 1: WARC lead-in.
053:                            "[\\t ]+"
054:                            + // Multiple tabs or spaces.
055:                            "([0-9]+)"
056:                            + // Regex group 2: Length.
057:                            "[\\t ]+"
058:                            + // Multiple tabs or spaces.
059:                            "(request|response|warcinfo|resource|metadata|"
060:                            + "revisit|conversion)" + // Regex group 3: Type of WARC Record.
061:                            "[\\t ]+" + // Multiple tabs or spaces.
062:                            "([^\\t ]+)" + // Regex group 4: Subject-uri.
063:                            "[\\t ]+" + // Multiple tabs or spaces.
064:                            "([0-9]{14})" + // Regex group 5: Date
065:                            "[\\t ]+" + // Multiple tabs or spaces.
066:                            "([^\\t ]+)" + // Regex group 6: Record-Id
067:                            "[\\t ]+" + // Multiple tabs or spaces.
068:                            "(.+)$"); // Regex group 7: Mimetype.
069:
070:            private Pattern WHITESPACE = Pattern.compile("\\s");
071:
072:            /**
073:             * Constructor.
074:             *
075:             * @param in Stream cue'd up to be at the start of the record this instance
076:             * is to represent.
077:             * @throws IOException
078:             */
079:            public WARCRecord(InputStream in, final String identifier,
080:                    final long offset) throws IOException {
081:                this (in, identifier, offset, true, false);
082:            }
083:
084:            /**
085:             * Constructor.
086:             * @param in Stream cue'd up just past Header Line and Named Fields.
087:             * @param headers Header Line and ANVL Named fields.
088:             * @throws IOException
089:             */
090:            public WARCRecord(InputStream in, ArchiveRecordHeader headers)
091:                    throws IOException {
092:                super (in, headers, 0, true, false);
093:            }
094:
095:            /**
096:             * Constructor.
097:             *
098:             * @param in Stream cue'd up to be at the start of the record this instance
099:             * is to represent or, if <code>headers</code> is not null, just past the
100:             * Header Line and Named Fields.
101:             * @param identifier Identifier for this the hosting Reader.
102:             * @param offset Current offset into <code>in</code> (Used to keep
103:             * <code>position</code> properly aligned).  Usually 0.
104:             * @param digest True if we're to calculate digest for this record.  Not
105:             * digesting saves about ~15% of cpu during parse.
106:             * @param strict Be strict parsing (Parsing stops if file inproperly
107:             * formatted).
108:             * @throws IOException
109:             */
110:            public WARCRecord(final InputStream in, final String identifier,
111:                    final long offset, boolean digest, boolean strict)
112:                    throws IOException {
113:                super (in, null, 0, digest, strict);
114:                setHeader(parseHeaders(in, identifier, offset, strict));
115:            }
116:
117:            /**
118:             * Parse WARC Header Line and Named Fields.
119:             * @param in Stream to read.
120:             * @param identifier Identifier for the hosting Reader.
121:             * @param offset Absolute offset into Reader.
122:             * @param strict Whether to be loose parsing or not.
123:             * @return An ArchiveRecordHeader.
124:             * @throws IOException 
125:             */
126:            protected ArchiveRecordHeader parseHeaders(final InputStream in,
127:                    final String identifier, final long offset,
128:                    final boolean strict) throws IOException {
129:                final Map<Object, Object> m = new HashMap<Object, Object>();
130:                m.put(ABSOLUTE_OFFSET_KEY, new Long(offset));
131:                m.put(READER_IDENTIFIER_FIELD_KEY, identifier);
132:                // Here we start reading off the inputstream but we're reading the
133:                // stream direct rather than going via WARCRecord#read.  The latter will
134:                // keep count of bytes read, digest and fail properly if EOR too soon...
135:                // We don't want digesting while reading Header Line and Named Fields.
136:                // 
137:                // The returned length includes terminating CRLF.
138:                int headLineLength = parseHeaderLine(in, m, strict);
139:
140:                // Now, doing the ANVL parse, hard to know how many bytes have been
141:                // read since passed Stream doesn't keep count and the ANVL parse can
142:                // throw away bytes (e.g. if white space padding at start of a folded
143:                // Value or if a Value has a newline in it and it gets converted to a
144:                // CRNL in the ANVL representation).  Wrap the stream in a
145:                // byte-counting stream.
146:                //
147:                // TODO: Buffering.  Currently, we rely on the deflate buffer when
148:                // file is gzipped.  Otherwise, if uncompressed, no buffering.
149:                final LongWrapper anvlParseLength = new LongWrapper(0);
150:                InputStream countingStream = new InputStream() {
151:                    @Override
152:                    public int read() throws IOException {
153:                        int c = in.read();
154:                        if (c != -1) {
155:                            anvlParseLength.longValue++;
156:                        }
157:                        return c;
158:                    }
159:                };
160:                parseNamedFields(countingStream, m);
161:                // Set offset at which content begins. Its the Header Line length plus
162:                // whatever we read parsing ANVL.
163:                final int contentOffset = (int) (headLineLength + anvlParseLength.longValue);
164:                incrementPosition(contentOffset);
165:
166:                return new ArchiveRecordHeader() {
167:                    private Map<Object, Object> fields = m;
168:                    private int contentBegin = contentOffset;
169:
170:                    public String getDate() {
171:                        return (String) this .fields.get(DATE_FIELD_KEY);
172:                    }
173:
174:                    public String getDigest() {
175:                        return (String) this .fields
176:                                .get(NAMED_FIELD_CHECKSUM_LABEL);
177:                    }
178:
179:                    public String getReaderIdentifier() {
180:                        return (String) this .fields
181:                                .get(READER_IDENTIFIER_FIELD_KEY);
182:                    }
183:
184:                    public Set getHeaderFieldKeys() {
185:                        return this .fields.keySet();
186:                    }
187:
188:                    public Map getHeaderFields() {
189:                        return this .fields;
190:                    }
191:
192:                    public Object getHeaderValue(String key) {
193:                        return this .fields.get(key);
194:                    }
195:
196:                    public long getLength() {
197:                        Object o = this .fields.get(LENGTH_FIELD_KEY);
198:                        if (o == null) {
199:                            return -1;
200:                        }
201:                        return ((Long) o).longValue();
202:                    }
203:
204:                    public String getMimetype() {
205:                        return (String) this .fields.get(MIMETYPE_FIELD_KEY);
206:                    }
207:
208:                    public long getOffset() {
209:                        Object o = this .fields.get(ABSOLUTE_OFFSET_KEY);
210:                        if (o == null) {
211:                            return -1;
212:                        }
213:                        return ((Long) o).longValue();
214:                    }
215:
216:                    public String getRecordIdentifier() {
217:                        return (String) this .fields
218:                                .get(RECORD_IDENTIFIER_FIELD_KEY);
219:                    }
220:
221:                    public String getUrl() {
222:                        return (String) this .fields.get(URL_FIELD_KEY);
223:                    }
224:
225:                    public String getVersion() {
226:                        return (String) this .fields.get(VERSION_FIELD_KEY);
227:                    }
228:
229:                    public int getContentBegin() {
230:                        return this .contentBegin;
231:                    }
232:
233:                    @Override
234:                    public String toString() {
235:                        return this .fields.toString();
236:                    }
237:                };
238:            }
239:
240:            protected int parseHeaderLine(final InputStream in,
241:                    final Map<Object, Object> fields, final boolean strict)
242:                    throws IOException {
243:                byte[] line = readLine(in, strict);
244:                if (line.length <= 2) {
245:                    throw new IOException("No Header Line found");
246:                }
247:                // Strip the CRLF.
248:                String headerLine = new String(line, 0, line.length - 2,
249:                        HEADER_LINE_ENCODING);
250:                Matcher m = HEADER_LINE.matcher(headerLine);
251:                if (!m.matches()) {
252:                    throw new IOException("Failed parse of Header Line: "
253:                            + headerLine);
254:                }
255:                for (int i = 0; i < HEADER_FIELD_KEYS.length; i++) {
256:                    if (i == 1) {
257:                        // Do length of Record as a Long.
258:                        fields.put(HEADER_FIELD_KEYS[i], Long.parseLong(m
259:                                .group(i + 1)));
260:                        continue;
261:                    }
262:                    fields.put(HEADER_FIELD_KEYS[i], m.group(i + 1));
263:                }
264:
265:                return line.length;
266:            }
267:
268:            /**
269:             * Read a line.
270:             * A 'line' in this context ends in CRLF and contains ascii-only and no
271:             * control-characters.
272:             * @param in InputStream to read.
273:             * @param strict Strict parsing (If false, we'll eat whitespace before the
274:             * record.
275:             * @return All bytes in line including terminating CRLF.
276:             * @throws IOException
277:             */
278:            protected byte[] readLine(final InputStream in, final boolean strict)
279:                    throws IOException {
280:                boolean done = false;
281:                boolean recordStart = strict;
282:                int read = 0;
283:                ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 /*SWAG*/);
284:                for (int c = -1, previousCharacter; !done;) {
285:                    if (read++ >= MAX_LINE_LENGTH) {
286:                        throw new IOException("Read " + MAX_LINE_LENGTH
287:                                + " bytes without finding CRLF");
288:                    }
289:                    previousCharacter = c;
290:                    c = in.read();
291:                    if (c == -1) {
292:                        throw new IOException("End-Of-Stream before CRLF:\n"
293:                                + new String(baos.toByteArray()));
294:                    }
295:                    if (isLF((char) c) && isCR((char) previousCharacter)) {
296:                        done = true;
297:                    } else if (!recordStart && Character.isWhitespace(c)) {
298:                        // Skip any whitespace at start.
299:                        continue;
300:                    } else {
301:                        if (isCR((char) previousCharacter)) {
302:                            // If previous character was a CR and this character is not
303:                            // a LF, we tested above, thats illegal.
304:                            throw new IOException("CR in middle of Header:\n"
305:                                    + new String(baos.toByteArray()));
306:                        }
307:
308:                        // Not whitespace so start record if we haven't already.
309:                        if (!recordStart) {
310:                            recordStart = true;
311:                        }
312:                    }
313:                    baos.write(c);
314:                }
315:                return baos.toByteArray();
316:            }
317:
318:            protected void parseNamedFields(final InputStream in,
319:                    final Map<Object, Object> fields) throws IOException {
320:                ANVLRecord r = ANVLRecord.load(in);
321:                fields.putAll(r.asMap());
322:            }
323:
324:            public static boolean isCROrLF(final char c) {
325:                return isCR(c) || isLF(c);
326:            }
327:
328:            public static boolean isCR(final char c) {
329:                return c == CRLF.charAt(0);
330:            }
331:
332:            public static boolean isLF(final char c) {
333:                return c == CRLF.charAt(1);
334:            }
335:
336:            @Override
337:            protected String getMimetype4Cdx(ArchiveRecordHeader h) {
338:                final String m = super .getMimetype4Cdx(h);
339:                // Mimetypes can have spaces in WARCs.  Emitting for CDX, just
340:                // squash them for now.  Later, quote them since squashing spaces won't
341:                // work for params that have quoted-string values.
342:                Matcher matcher = WHITESPACE.matcher(m);
343:                return matcher.replaceAll("");
344:            }
345:        }

www.java2java.com | Contact Us

All other trademarks are property of their respective owners.