001: package net.matuschek.spider;
002:
003: /*********************************************
004: Copyright (c) 2001 by Daniel Matuschek
005: *********************************************/
006:
007: import java.util.HashSet;
008: import java.util.LinkedList;
009:
010: /**
011: * Memory based implementation of the TaskList interface.
012: * Uses a LinkedList for sequential access
013: * and a HashSet for fast retrieval
014: * To be thread safe, all methods are synchronized.
015: * <p>
016: * @author Daniel Matuschek, Jo Christen, Oliver Schmidt
017: * @version $Id: HashedMemoryTaskList.java,v 1.7 2004/08/09 13:07:22 matuschd Exp $*
018: */
019: public class HashedMemoryTaskList implements TaskList {
020:
021: public static int DEFAULT_CAPACITY = 50000;
022:
023: /**
024: * Task store as linked list for sequential access.
025: *
026: * @link aggregation
027: * @associates <{RobotTask}>
028: */
029: //private Vector list = new Vector();
030: private LinkedList list = new LinkedList();
031:
032: /**
033: * Task store as hash set for fast searching
034: * In this list the internal string represantation of
035: * the task will be stored.
036: *
037: * @link aggregation
038: * @associates <{RobotTask}>
039: */
040: private HashSet hashset = null;
041:
042: /**
043: * Defines, if sequential access if allowed.
044: *
045: * Under some circumstances it may be useful to NOT have
046: * sequential access but only to find, if an object is in
047: * the list. E.g. this makes sense for the list of URLs
048: * already visited.
049: */
050: private boolean sequentialAccess;
051:
052: /**
053: * Standard constructor.
054: * (sequentialAccess=true, initialCapacity=DEFAULT_CAPACITY)
055: */
056: public HashedMemoryTaskList() {
057: this (true, DEFAULT_CAPACITY);
058: }
059:
060: /**
061: * Constructor with sequentialAccess-flag
062: *
063: * @param sequentialAccess can be set to false if the removeFirst
064: * and elementAt methods are not used. In this case, storage
065: * of a task will need less memory. This can be interesting if you
066: * have to store lots of objects in this list.
067: */
068: public HashedMemoryTaskList(boolean sequentialAccess) {
069: this (sequentialAccess, DEFAULT_CAPACITY);
070: }
071:
072: /**
073: * Constructor with sequentialAccess-flag and initialCapacity.
074: *
075: * @param sequentialAccess can be set to false if the removeFirst
076: * @param initial capacity (expected document count)
077: * and elementAt methods are not used. In this case, storage
078: * of a task will need less memory. This can be interesting if you
079: * have to store lots of objects in this list.
080: */
081: public HashedMemoryTaskList(boolean sequentialAccess,
082: int initialCapacity) {
083: this .sequentialAccess = sequentialAccess;
084: this .hashset = new HashSet(initialCapacity);
085: }
086:
087: /**
088: * Add a task to the end of the list
089: *
090: * @param task a RobotTask object to store in the queue
091: */
092: public synchronized void add(RobotTask task) {
093: if (this .sequentialAccess) {
094: list.add(task);
095: }
096: hashset.add(task);
097: }
098:
099: /**
100: * Add a task at the beginning of list
101: * @param task a RobotTask object to store in the queue
102: */
103: public synchronized void addAtStart(RobotTask task) {
104: if (this .sequentialAccess) {
105: list.add(0, task);
106: }
107: // we need to put a dummy value object to the HashTable, adding
108: // null as value do not work
109: hashset.add(task);
110: }
111:
112: /**
113: * Clean up the list, remove all objects
114: */
115: public synchronized void clear() {
116: list.clear();
117: hashset.clear();
118: }
119:
120: /**
121: * Is this robot task stored in the list ?
122: */
123: public synchronized boolean contains(RobotTask task) {
124: return hashset.contains(task);
125: }
126:
127: /**
128: * Remove this object from the list
129: */
130: public synchronized boolean remove(RobotTask task) {
131: hashset.remove(task);
132: if (this .sequentialAccess) {
133: return list.remove(task);
134: } else {
135: return true;
136: }
137: }
138:
139: /**
140: * Get and remove the first element.
141: *
142: * @return the first task in the list. This object will also be removed
143: * from the list.
144: * @exception ArrayIndexOutOfBoundsException if the list is empty or
145: * sequential access is not allowed
146: */
147: public synchronized RobotTask removeFirst()
148: throws ArrayIndexOutOfBoundsException {
149: if (!this .sequentialAccess) {
150: throw new ArrayIndexOutOfBoundsException(
151: "sequential access not allowed");
152: }
153: //RobotTask task = (RobotTask)list.elementAt(0);
154: //list.removeElementAt(0);
155: RobotTask task = (RobotTask) list.removeFirst();
156: hashset.remove(task);
157: return task;
158: }
159:
160: /**
161: * Returns the number of elements in this list
162: */
163: public synchronized int size() {
164: return hashset.size();
165: }
166:
167: /**
168: * Get the n-th element in the list. Elements are numbered form 0 to
169: * size-1.
170: * @param position
171: * @exception ArrayIndexOutOfBoundsException if the given position doesn't
172: * exist
173: */
174: public synchronized RobotTask elementAt(int position)
175: throws ArrayIndexOutOfBoundsException {
176: if (!this .sequentialAccess) {
177: throw new ArrayIndexOutOfBoundsException(
178: "sequential access not allowed");
179: }
180: //return (RobotTask)(list.elementAt(position));
181: return (RobotTask) (list.get(position));
182: }
183:
184: }
|