001: /* ANVLRecord
002: *
003: * $Id: ANVLRecord.java 4545 2006-08-26 00:33:38Z stack-sf $
004: *
005: * Created on July 26, 2006.
006: *
007: * Copyright (C) 2006 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.util.anvl;
026:
027: import java.io.ByteArrayOutputStream;
028: import java.io.IOException;
029: import java.io.InputStream;
030: import java.io.UnsupportedEncodingException;
031: import java.util.ArrayList;
032: import java.util.Collection;
033: import java.util.HashMap;
034: import java.util.Iterator;
035: import java.util.List;
036: import java.util.Map;
037:
038: import org.archive.io.UTF8Bytes;
039:
040: /**
041: * An ordered {@link List} with 'data' {@link Element} values.
042: * ANVLRecords end with a blank line.
043: *
044: * @see <a
045: * href="http://www.cdlib.org/inside/diglib/ark/anvlspec.pdf">A Name-Value
046: * Language (ANVL)</a>
047: * @author stack
048: */
049: public class ANVLRecord extends ArrayList<Element> implements UTF8Bytes {
050: private static final long serialVersionUID = -4610638888453052958L;
051:
052: public static final String MIMETYPE = "text/anvl";
053:
054: public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord();
055:
056: /**
057: * Arbitrary upper bound on maximum size of ANVL Record.
058: * Will throw an IOException if exceed this size.
059: */
060: public static final long MAXIMUM_SIZE = 1024 * 10;
061:
062: /**
063: * An ANVL 'newline'.
064: * @see <a href="http://en.wikipedia.org/wiki/CRLF">http://en.wikipedia.org/wiki/CRLF</a>
065: */
066: static final String CRLF = "\r\n";
067:
068: static final String FOLD_PREFIX = CRLF + ' ';
069:
070: public ANVLRecord() {
071: super ();
072: }
073:
074: public ANVLRecord(Collection<? extends Element> c) {
075: super (c);
076: }
077:
078: public ANVLRecord(int initialCapacity) {
079: super (initialCapacity);
080: }
081:
082: public boolean addLabel(final String l) {
083: return super .add(new Element(new Label(l)));
084: }
085:
086: public boolean addLabelValue(final String l, final String v) {
087: return super .add(new Element(new Label(l), new Value(v)));
088: }
089:
090: @Override
091: public String toString() {
092: // TODO: What to emit for empty ANVLRecord?
093: StringBuilder sb = new StringBuilder();
094: for (final Iterator<Element> i = iterator(); i.hasNext();) {
095: sb.append(i.next());
096: sb.append(CRLF);
097: }
098: // 'ANVL Records end in a blank line'.
099: sb.append(CRLF);
100: return sb.toString();
101: }
102:
103: public Map<String, String> asMap() {
104: Map<String, String> m = new HashMap<String, String>(size());
105: for (final Iterator<Element> i = iterator(); i.hasNext();) {
106: Element e = i.next();
107: m.put(e.getLabel().toString(), e.isValue() ? e.getValue()
108: .toString() : (String) null);
109: }
110: return m;
111: }
112:
113: @Override
114: public ANVLRecord clone() {
115: return new ANVLRecord(this );
116: }
117:
118: /**
119: * @return This ANVLRecord as UTF8 bytes.
120: */
121: public byte[] getUTF8Bytes() throws UnsupportedEncodingException {
122: return toString().getBytes(UTF8);
123: }
124:
125: /**
126: * Parses a single ANVLRecord from passed InputStream.
127: * Read as a single-byte stream until we get to a CRLFCRLF which
128: * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream.
129: * Doing it this way, while requiring a double-scan, it makes it so do not
130: * need to be passed a RepositionableStream or a Stream that supports
131: * marking. Also no danger of over-reading which can happen when we
132: * wrap passed Stream with an InputStreamReader for doing UTF-8
133: * character conversion (See the ISR class comment).
134: * @param is InputStream
135: * @return An ANVLRecord instance.
136: * @throws IOException
137: */
138: public static ANVLRecord load(final InputStream is)
139: throws IOException {
140: // It doesn't look like a CRLF sequence is possible in UTF-8 without
141: // it signifying CRLF: The top bits are set in multibyte characters.
142: // Was thinking of recording CRLF as I was running through this first
143: // parse but the offsets would then be incorrect if any multibyte
144: // characters in the intervening gaps between CRLF.
145: boolean isCRLF = false;
146: boolean recordStart = false;
147: ByteArrayOutputStream baos = new ByteArrayOutputStream(1024);
148: boolean done = false;
149: int read = 0;
150: for (int c = -1, previousCharacter; !done;) {
151: if (read++ >= MAXIMUM_SIZE) {
152: throw new IOException("Read " + MAXIMUM_SIZE
153: + " bytes without finding \\r\\n\\r\\n "
154: + "End-Of-ANVLRecord");
155: }
156: previousCharacter = c;
157: c = is.read();
158: if (c == -1) {
159: throw new IOException(
160: "End-Of-Stream before \\r\\n\\r\\n "
161: + "End-Of-ANVLRecord:\n"
162: + new String(baos.toByteArray(), UTF8));
163: }
164: if (isLF((char) c) && isCR((char) previousCharacter)) {
165: if (isCRLF) {
166: // If we just had a CRLF, then its two CRLFs and its end of
167: // record. We're done.
168: done = true;
169: } else {
170: isCRLF = true;
171: }
172: } else if (!recordStart && Character.isWhitespace(c)) {
173: // Skip any whitespace at start of ANVLRecord.
174: continue;
175: } else {
176: // Clear isCRLF flag if this character is NOT a '\r'.
177: if (isCRLF && !isCR((char) c)) {
178: isCRLF = false;
179: }
180: // Not whitespace so start record if we haven't already.
181: if (!recordStart) {
182: recordStart = true;
183: }
184: }
185: baos.write(c);
186: }
187: return load(new String(baos.toByteArray(), UTF8));
188: }
189:
190: /**
191: * Parse passed String for an ANVL Record.
192: * Looked at writing javacc grammer but preprocessing is required to
193: * handle folding: See
194: * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173.
195: * Looked at Terence Parr's ANTLR. More capable. Can set lookahead count.
196: * A value of 3 would help with folding. But its a pain defining UNICODE
197: * grammers -- needed by ANVL -- and support seems incomplete
198: * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode.
199: * For now, go with the below hand-rolled parser.
200: * @param s String with an ANVLRecord.
201: * @return ANVLRecord parsed from passed String.
202: * @throws IOException
203: */
204: public static ANVLRecord load(final String s) throws IOException {
205: ANVLRecord record = new ANVLRecord();
206: boolean inValue = false, inLabel = false, inComment = false, inNewLine = false;
207: String label = null;
208: StringBuilder sb = new StringBuilder(s.length());
209: for (int i = 0; i < s.length(); i++) {
210: char c = s.charAt(i);
211:
212: // Assert I can do look-ahead.
213: if ((i + 1) > s.length()) {
214: throw new IOException("Premature End-of-ANVLRecord:\n"
215: + s.substring(i));
216: }
217:
218: // If at LF of a CRLF, just go around again. Eat up the LF.
219: if (inNewLine && isLF(c)) {
220: continue;
221: }
222:
223: // If we're at a CRLF and we were just on one, exit. Found Record.
224: if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) {
225: break;
226: }
227:
228: // Check if we're on a fold inside a Value. Skip multiple white
229: // space after CRLF.
230: if (inNewLine && inValue && Character.isWhitespace(c)) {
231: continue;
232: }
233:
234: // Else set flag if we're at a CRLF.
235: inNewLine = isCR(c) && isLF(s.charAt(i + 1));
236:
237: if (inNewLine) {
238: if (inComment) {
239: inComment = false;
240: } else if (label != null && !inValue) {
241: // Label only 'data element'.
242: record.addLabel(label);
243: label = null;
244: sb.setLength(0);
245: } else if (inValue) {
246: // Assert I can do look-ahead past current CRLF.
247: if ((i + 3) > s.length()) {
248: throw new IOException(
249: "Premature End-of-ANVLRecord "
250: + "(2):\n" + s.substring(i));
251: }
252: if (!isCR(s.charAt(i + 2))
253: && !isLF(s.charAt(i + 3))
254: && Character.isWhitespace(s.charAt(i + 2))) {
255: // Its a fold. Let it go around. But add in a CRLF and
256: // space and do it here. We don't let CRLF fall through
257: // to the sb.append on the end of this loop.
258: sb.append(CRLF);
259: sb.append(' ');
260: } else {
261: // Next line is a new SubElement, a new Comment or
262: // Label.
263: record.addLabelValue(label, sb.toString());
264: sb.setLength(0);
265: label = null;
266: inValue = false;
267: }
268: } else {
269: // We're whitespace between label and value or whitespace
270: // before we've figured whether label or comment.
271: }
272: // Don't let the '\r' or CRLF through.
273: continue;
274: }
275:
276: if (inComment) {
277: continue;
278: } else if (inLabel) {
279: if (c == Label.COLON) {
280: label = sb.toString();
281: sb.setLength(0);
282: inLabel = false;
283: continue;
284: }
285: } else {
286: if (!inLabel && !inValue && !inComment) {
287: // We have no state. Figure one.
288: if (Character.isWhitespace(c)) {
289: // If no state, and whitespace, skip. Don't record.
290: continue;
291: } else if (label == null && c == '#') {
292: inComment = true;
293: // Don't record comments.
294: continue;
295: } else if (label == null) {
296: inLabel = true;
297: } else {
298: inValue = true;
299: }
300: }
301: }
302: sb.append(c);
303: }
304: return record;
305: }
306:
307: /**
308: * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is
309: * CRLFCRLF so is of size 4. Also, expensive, since it makes String of
310: * the record so it can count bytes.
311: */
312: public synchronized int getLength() {
313: int length = -1;
314: try {
315: length = getUTF8Bytes().length;
316: } catch (UnsupportedEncodingException e) {
317: throw new RuntimeException(e);
318: }
319: return length;
320: }
321:
322: public static boolean isCROrLF(final char c) {
323: return isCR(c) || isLF(c);
324: }
325:
326: public static boolean isCR(final char c) {
327: return c == ANVLRecord.CRLF.charAt(0);
328: }
329:
330: public static boolean isLF(final char c) {
331: return c == ANVLRecord.CRLF.charAt(1);
332: }
333: }
|