001: /* AdaptiveRevisitHostQueueTest.java
002: *
003: * Created on Sep 13, 2004
004: *
005: * Copyright (C) 2004 Kristinn Sigurdsson.
006: *
007: * This file is part of the Heritrix web crawler (crawler.archive.org).
008: *
009: * Heritrix is free software; you can redistribute it and/or modify
010: * it under the terms of the GNU Lesser Public License as published by
011: * the Free Software Foundation; either version 2.1 of the License, or
012: * any later version.
013: *
014: * Heritrix is distributed in the hope that it will be useful,
015: * but WITHOUT ANY WARRANTY; without even the implied warranty of
016: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
017: * GNU Lesser Public License for more details.
018: *
019: * You should have received a copy of the GNU Lesser Public License
020: * along with Heritrix; if not, write to the Free Software
021: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
022: */
023: package org.archive.crawler.frontier;
024:
025: import java.io.File;
026:
027: import org.archive.crawler.datamodel.CrawlURI;
028: import org.archive.net.UURI;
029: import org.archive.net.UURIFactory;
030: import org.archive.util.TmpDirTestCase;
031: import org.archive.util.FileUtils;
032:
033: import com.sleepycat.bind.serial.StoredClassCatalog;
034: import com.sleepycat.je.DatabaseConfig;
035: import com.sleepycat.je.Environment;
036: import com.sleepycat.je.EnvironmentConfig;
037:
038: /**
039: * A JUnit test for {@link AdaptiveRevisitHostQueue AdaptiveRevisitHostQueue}
040: * class.
041: * <p>
042: * Since the ARHostQueue maintains significant state information there is only
043: * one Unit test described here that tests various different transitions.
044: *
045: * @author Kristinn Sigurdsson
046: */
047: public class AdaptiveRevisitHostQueueTest extends TmpDirTestCase
048: implements AdaptiveRevisitAttributeConstants {
049: public void testHQ() throws Exception {
050: EnvironmentConfig envConfig = new EnvironmentConfig();
051: envConfig.setTransactional(true);
052: envConfig.setAllowCreate(true);
053: File envDir = new File(getTmpDir(), "AR");
054: if (envDir.exists()) {
055: FileUtils.deleteDir(envDir);
056: }
057: envDir.mkdirs();
058: Environment env = new Environment(envDir, envConfig);
059: // Open the class catalog database. Create it if it does not
060: // already exist.
061: DatabaseConfig dbConfig = new DatabaseConfig();
062: dbConfig.setAllowCreate(true);
063: StoredClassCatalog catalog = new StoredClassCatalog(env
064: .openDatabase(null, "classes", dbConfig));
065: AdaptiveRevisitHostQueue hq = new AdaptiveRevisitHostQueue(
066: "bok.hi.is", env, catalog, 1);
067:
068: // Make the CrawlUris
069: CrawlURI[] curis = { null, null, null, null };
070:
071: UURI uuri = UURIFactory.getInstance("http://bok.hi.is/1.html");
072: curis[0] = new CrawlURI(uuri);
073: curis[0].setVia(null);
074:
075: uuri = UURIFactory.getInstance("http://bok.hi.is/2.html");
076: curis[1] = new CrawlURI(uuri);
077: curis[1].setVia(null);
078:
079: uuri = UURIFactory.getInstance("http://bok.hi.is/3.html");
080: curis[2] = new CrawlURI(uuri);
081: curis[2].setVia(null);
082:
083: uuri = UURIFactory.getInstance("http://bok.hi.is/4.html");
084: curis[3] = new CrawlURI(uuri);
085: curis[3].setVia(null);
086:
087: assertTrue("HQ should be empty initially",
088: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
089: assertEquals("Incorrect nextReadyTime on Empty",
090: Long.MAX_VALUE, hq.getNextReadyTime());
091: assertEquals("Initial size of HQ should be 0", 0, hq.getSize());
092:
093: assertEquals(
094: "Peek should return null when 'ready queue' is empty",
095: null, hq.peek());
096:
097: /*
098: * Add three CrawlURIs and ensures that the correct one is reported by
099: * peek(); All are added later then current time!
100: */
101:
102: curis[0].putLong(A_TIME_OF_NEXT_PROCESSING, System
103: .currentTimeMillis()); // now
104: curis[1].putLong(A_TIME_OF_NEXT_PROCESSING, System
105: .currentTimeMillis() + 5000); // in 5 sec
106: curis[2].putLong(A_TIME_OF_NEXT_PROCESSING, System
107: .currentTimeMillis() + 20000); // in 20 sec.
108:
109: hq.add(curis[0], false);
110: assertEquals("First CrawlURI should be top", curis[0]
111: .toString(), hq.peek().toString());
112: assertTrue("HQ should no longer be empty",
113: hq.getState() != AdaptiveRevisitHostQueue.HQSTATE_EMPTY);
114: assertEquals("Size of HQ should now be 1", 1, hq.getSize());
115:
116: /*
117: * Invoke next and ensure that the HQ is now busy (initial valence was
118: * set to 1). Also check for proper errors for a busy HQ. Such as when
119: * trying to reinvoke next().
120: *
121: */
122: CrawlURI curi = hq.next(); // Should return curis[2]
123: assertEquals("next() did not return 'top' URI", curis[0]
124: .toString(), curi.toString());
125: assertTrue("HQ should now be busy, is " + hq.getStateByName(),
126: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
127: try {
128: hq.next();
129: assertTrue(
130: "next() should throw an IllegalStateException if HQ "
131: + "not ready", false);
132: } catch (IllegalStateException e) {
133: // This is supposed to happen.
134: }
135: assertEquals("New top URI should be null", null, hq.peek());
136:
137: hq.add(curis[1], false);
138: assertEquals("Second CrawlURI should be top", curis[1]
139: .toString(), hq.peek().toString());
140: assertEquals("Size of HQ should now be 2", 2, hq.getSize());
141:
142: // Return it with next fetch time in the future.
143: curi.putLong(A_TIME_OF_NEXT_PROCESSING, hq.peek().getLong(
144: A_TIME_OF_NEXT_PROCESSING) + 100000); // 100 sec behind current top.
145: hq.update(curi, false, 0);
146: assertEquals("Second CrawlURI should be still be top", curis[1]
147: .toString(), hq.peek().toString());
148: assertEquals("Size of HQ should still be 2", 2, hq.getSize());
149:
150: hq.add(curis[2], false);
151: assertEquals("Second CrawlURI should still be top", curis[1]
152: .toString(), hq.peek().toString());
153: assertEquals("Size of HQ should now be 3", 3, hq.getSize());
154:
155: /*
156: * If there are no URIs ready, the queue should snooze, even though no
157: * politeness demand has been made.
158: * <p>
159: * Confirms this and that it wakes up.
160: */
161: assertTrue(
162: "HQ should be snoozed, is " + hq.getStateByName(),
163: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
164: // Wait past wakeup time
165: synchronized (this ) {
166: wait(hq.getNextReadyTime() - System.currentTimeMillis()
167: + 100);
168: }
169: assertTrue("HQ should now be ready, is " + hq.getStateByName(),
170: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
171:
172: /*
173: * Re-adds a URI with a lower ready time which should promote it to the
174: * top of the queue. Checks if this happens correctly.
175: *
176: * Then tests an add override which would demote it back, ensures that
177: * this fails as it should (i.e. URIs time of next processing remains
178: * unchanged).
179: */
180: curis[2].putLong(A_TIME_OF_NEXT_PROCESSING, curis[1]
181: .getLong(A_TIME_OF_NEXT_PROCESSING) - 1000); // 1 sec. prior to current top
182: hq.add(curis[2], true);
183: assertEquals("Size of HQ should still be 3", hq.getSize(), 3);
184: assertEquals("Third CrawlURI should be now be top", curis[2]
185: .toString(), hq.peek().toString());
186: curis[2].putLong(A_TIME_OF_NEXT_PROCESSING, curis[1]
187: .getLong(A_TIME_OF_NEXT_PROCESSING) + 10000); // 10 sec. later
188: hq.add(curis[2], true);
189: assertEquals("Size of HQ should still be 3", hq.getSize(), 3);
190: assertEquals("Third CrawlURI should still top", curis[2]
191: .toString(), hq.peek().toString());
192:
193: /*
194: * Invoke next and ensure that the HQ is now busy (initial valence was
195: * set to 1). Also check for proper errors for a busy HQ. Such as when
196: * trying to reinvoke next().
197: *
198: */
199: curi = hq.next(); // Should return curis[2]
200: assertEquals("next() did not return 'top' URI", curis[2]
201: .toString(), curi.toString());
202: assertTrue("HQ should now be busy, is " + hq.getStateByName(),
203: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
204: try {
205: hq.next();
206: assertTrue(
207: "next() should throw an IllegalStateException if HQ "
208: + "not ready", false);
209: } catch (IllegalStateException e) {
210: // This is supposed to happen.
211: }
212: assertEquals("New top URI", curis[1].toString(), hq.peek()
213: .toString());
214:
215: /*
216: * Add a URI while HQ is busy. Check if this succeeds normally.
217: *
218: */
219:
220: curis[3].putLong(A_TIME_OF_NEXT_PROCESSING, curis[1]
221: .getLong(A_TIME_OF_NEXT_PROCESSING) - 1); // 1 msec. ahead of current top (order [2] 3 1 0)
222: hq.add(curis[3], false);
223: assertEquals("Size of HQ should now be 4", 4, hq.getSize());
224:
225: /*
226: * Invoke update, first with an invalid URI (not the one issued by
227: * next() earlier), this should fail. Then with the correct one, this
228: * should succeed. Then finally test update again with an invalid URI
229: * (i.e. when no HQ has no outstanding URIs, that should fail.
230: *
231: * At each step, proper checks are made of state and that methods give
232: * appropriate errors.
233: *
234: * Updated URI is given low time of next processing to put it 'in front'
235: */
236:
237: try {
238: hq.update(curis[1], false, 0);
239: assertTrue("update() should not accept URI", false);
240: } catch (IllegalStateException e) {
241: // This is supposed to happen
242: }
243:
244: // We do not change the 'time of next processing' on update
245: // so curis[2] should again be at top of queue.
246: long timeOfPolitenessWakeUp = System.currentTimeMillis() + 2000;
247: hq.update(curi, true, timeOfPolitenessWakeUp); // Wake in 5 sec.
248: assertTrue(
249: "HQ should be snoozed, is " + hq.getStateByName(),
250: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_SNOOZED);
251:
252: try {
253: hq.update(curis[2], false, 0);
254: assertTrue("update() should not accept URI", false);
255: } catch (IllegalStateException e) {
256: // This is supposed to happen
257: }
258: assertEquals(
259: "HQs time of next ready should reflect set wait time ",
260: timeOfPolitenessWakeUp, hq.getNextReadyTime());
261:
262: /*
263: * Check if the HQ wakes up from it's 'snoozing'
264: *
265: */
266: // Wait past wakeup time
267: synchronized (this ) {
268: wait(hq.getNextReadyTime() - System.currentTimeMillis()
269: + 100);
270: }
271: assertTrue("HQ should now be ready, is " + hq.getStateByName(),
272: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
273: assertEquals(
274: "HQs time of next ready should still be when it 'woken' "
275: + "up.", timeOfPolitenessWakeUp, hq
276: .getNextReadyTime());
277:
278: /*
279: * Invoke next so that the HQ has a URI being processed. Then
280: * close the HQ and reopen it to ensure that this happens normally, i.e.
281: * state is recovered properly, including the restoration of the URI
282: * being processed, back to the regular queue (where it should be
283: * first).
284: *
285: * On recreating the HQ, set valence to 2.
286: */
287: curi = hq.next(); // Should return curis[2]
288: assertEquals("next() did not return 'top' URI", curis[2]
289: .toString(), curi.toString());
290: assertTrue("HQ should now be busy, is " + hq.getStateByName(),
291: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
292: hq.close();
293:
294: hq = new AdaptiveRevisitHostQueue("bok.hi.is", env, catalog, 2);
295:
296: assertEquals("Size of HQ after reopening should now be 4", 4,
297: hq.getSize());
298: assertTrue("HQ should be ready on reopen, is "
299: + hq.getStateByName(),
300: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
301: assertEquals("CrawlURI 'in processing' before should be top",
302: curi.toString(), hq.peek().toString());
303:
304: /* Check if valence higher then 1 is properly handled.
305: *
306: * Invoke next(), check if still ready and new top URI.
307: */
308: curi = hq.next(); // Should return curis[2]
309: assertEquals("next() did not return 'top' URI", curis[2]
310: .toString(), curi.toString());
311: assertTrue("HQ should still be ready, is "
312: + hq.getStateByName(),
313: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
314:
315: /* Invoke next() again, check if now busy.
316: */
317: curi = hq.next(); // Should return curis[3]
318: assertEquals("next() did not return 'top' URI", curis[3]
319: .toString(), curi.toString());
320: assertTrue("HQ should be busy, is " + hq.getStateByName(), hq
321: .getState() == AdaptiveRevisitHostQueue.HQSTATE_BUSY);
322: assertEquals("Size of HQ should still be 4", 4, hq.getSize());
323:
324: /* Update() second URI issued. Confirm HQ is now ready again. URI is
325: * given same time of next processing to put it 'in front'. (no snooze)
326: */
327: hq.update(curi, false, 0);
328: assertTrue("HQ should now be ready, is " + hq.getStateByName(),
329: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
330: assertEquals("'updated' CrawlURI before should be top", curi
331: .toString(), hq.peek().toString());
332:
333: /* Update() again, ensure proper state. URI is NOT placed at front of
334: * queue and snooze time is given. But the HQ should not enter a
335: * snoozed state because the 'other' slot is free.
336: */
337:
338: hq.update(curis[2], true, System.currentTimeMillis() + 1000000); // 10sec
339: curis[3].putLong(A_TIME_OF_NEXT_PROCESSING, curis[1]
340: .getLong(A_TIME_OF_NEXT_PROCESSING) + 1000); // 1 sec. behind of current top
341: assertTrue("HQ should still be ready, is "
342: + hq.getStateByName(),
343: hq.getState() == AdaptiveRevisitHostQueue.HQSTATE_READY);
344: assertEquals("Top CrawlURI before should be unchanged", curi
345: .toString(), hq.peek().toString());
346:
347: // TODO: Test sorting with scheduling directives.
348:
349: /*
350: * Close the ARHostQueue and the Environment
351: */
352: hq.close();
353: catalog.close();
354: env.close();
355: cleanUpOldFiles("AR");
356: }
357:
358: }
|