001: /* DiskFPMergeUriUniqFilter
002: *
003: * $Id: DiskFPMergeUriUniqFilter.java 4340 2006-07-13 06:04:11Z gojomo $
004: *
005: * Created on Dec 14, 2005
006: *
007: * Copyright (C) 2005 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.util;
026:
import it.unimi.dsi.fastutil.longs.LongIterator;
import it.unimi.dsi.fastutil.longs.LongIterators;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.NoSuchElementException;

import org.archive.util.ArchiveUtils;
042:
043: /**
044: * Crude FPMergeUriUniqFilter using a disk data file of raw longs as the
045: * overall FP record.
046: *
047: * @author gojomo
048: */
049: public class DiskFPMergeUriUniqFilter extends FPMergeUriUniqFilter {
050: long count = 0;
051: File scratchDir;
052: File currentFps;
053: File newFpsFile;
054: DataOutputStream newFps;
055: long newCount;
056: DataInputStream oldFps;
057:
058: public DiskFPMergeUriUniqFilter(File scratchDir) {
059: super ();
060: this .scratchDir = scratchDir;
061: // TODO: Use two scratch locations, to allow IO to be split
062: // over separate disks
063: }
064:
065: /* (non-Javadoc)
066: * @see org.archive.crawler.util.FPMergeUriUniqFilter#beginFpMerge()
067: */
068: protected LongIterator beginFpMerge() {
069: newFpsFile = new File(scratchDir, ArchiveUtils.get17DigitDate()
070: + ".fp");
071: if (newFpsFile.exists()) {
072: throw new RuntimeException(newFpsFile + " exists");
073: }
074: try {
075: newFps = new DataOutputStream(new BufferedOutputStream(
076: new FileOutputStream(newFpsFile)));
077: } catch (FileNotFoundException e) {
078: throw new RuntimeException(e);
079: }
080: newCount = 0;
081: if (currentFps == null) {
082: return LongIterators.EMPTY_ITERATOR;
083: }
084: try {
085: oldFps = new DataInputStream(new BufferedInputStream(
086: new FileInputStream(currentFps)));
087: } catch (FileNotFoundException e1) {
088: throw new RuntimeException(e1);
089: }
090: return new DataFileLongIterator(oldFps);
091: }
092:
093: /* (non-Javadoc)
094: * @see org.archive.crawler.util.FPMergeUriUniqFilter#addNewFp(long)
095: */
096: protected void addNewFp(long fp) {
097: try {
098: newFps.writeLong(fp);
099: newCount++;
100: } catch (IOException e) {
101: throw new RuntimeException(e);
102: }
103: }
104:
105: /* (non-Javadoc)
106: * @see org.archive.crawler.util.FPMergeUriUniqFilter#finishFpMerge()
107: */
108: protected void finishFpMerge() {
109: try {
110: newFps.close();
111: File oldFpsFile = currentFps;
112: currentFps = newFpsFile;
113: if (oldFps != null) {
114: oldFps.close();
115: }
116: if (oldFpsFile != null) {
117: oldFpsFile.delete();
118: }
119: } catch (IOException e) {
120: throw new RuntimeException(e);
121: }
122: count = newCount;
123: }
124:
125: /* (non-Javadoc)
126: * @see org.archive.crawler.datamodel.UriUniqFilter#count()
127: */
128: public long count() {
129: return count;
130: }
131:
132: public class DataFileLongIterator implements LongIterator {
133: DataInputStream in;
134: long next;
135: boolean nextIsValid = false;
136:
137: /**
138: * Construct a long iterator reading from the given
139: * stream.
140: *
141: * @param disStream DataInputStream from which to read longs
142: */
143: public DataFileLongIterator(DataInputStream disStream) {
144: this .in = disStream;
145: }
146:
147: /**
148: * Test whether any items remain; loads next item into
149: * holding 'next' field.
150: *
151: * @see java.util.Iterator#hasNext()
152: */
153: public boolean hasNext() {
154: return nextIsValid ? true : lookahead();
155: }
156:
157: /**
158: * Check if there's a next by trying to read it.
159: *
160: * @return true if 'next' field is filled with a valid next, false otherwise
161: */
162: protected boolean lookahead() {
163: try {
164: next = in.readLong();
165: } catch (IOException e) {
166: return false;
167: }
168: nextIsValid = true;
169: return true;
170: }
171:
172: /**
173: * Return the next item.
174: *
175: * @see java.util.Iterator#next()
176: */
177: public Long next() {
178: if (!hasNext()) {
179: throw new NoSuchElementException();
180: }
181: // 'next' is guaranteed set by a hasNext() which returned true
182: Long returnObj = new Long(this .next);
183: this .nextIsValid = false;
184: return returnObj;
185: }
186:
187: /* (non-Javadoc)
188: * @see java.util.Iterator#remove()
189: */
190: public void remove() {
191: throw new UnsupportedOperationException();
192: }
193:
194: /* (non-Javadoc)
195: * @see it.unimi.dsi.fastutil.longs.LongIterator#nextLong()
196: */
197: public long nextLong() {
198: if (!hasNext()) {
199: throw new NoSuchElementException();
200: }
201: // 'next' is guaranteed non-null by a hasNext() which returned true
202: this .nextIsValid = false; // after this return, 'next' needs refresh
203: return this .next;
204: }
205:
206: /* (non-Javadoc)
207: * @see it.unimi.dsi.fastutil.longs.LongIterator#skip(int)
208: */
209: public int skip(int arg0) {
210: return 0;
211: }
212: }
213:
214: }
|