001: /*
002: * Copyright 2005 by Lars Torunski
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017: package com.torunski.crawler.model;
018:
019: import java.util.Collection;
020: import java.util.HashMap;
021: import java.util.Iterator;
022:
023: import org.apache.commons.logging.Log;
024: import org.apache.commons.logging.LogFactory;
025:
026: import com.torunski.crawler.link.Link;
027:
028: /**
029: * Fast model with a small memory footprint and cpu usage which stops if a number of pages are parsed.
030: *
031: * @author Lars Torunski
032: * @version $Revision: 1.8 $
033: */
034: public class MaxIterationsModel implements ICrawlerModel {
035:
036: private static final transient Log log = LogFactory
037: .getLog(MaxIterationsModel.class);
038:
039: /** The default number of iterations */
040: public static final int DEFAULT_MAX_ITERATIONS = 32;
041:
042: /** The max iterations */
043: private int iterations;
044:
045: /** A map of the visited links */
046: private HashMap visitedURIs = new HashMap();
047:
048: /** A map of the missed visited links */
049: private HashMap toVisitURIs = new HashMap();
050:
051: /**
052: * Constructor for Crawler.
053: */
054: public MaxIterationsModel() {
055: this (DEFAULT_MAX_ITERATIONS);
056: }
057:
058: /**
059: * Constructor for Crawler.
060: * @param iterations the max iterations.
061: */
062: public MaxIterationsModel(int iterations) {
063: this .iterations = iterations;
064:
065: log.debug("Crawler model: "
066: + MaxIterationsModel.class.getName());
067: log.debug("- max iterations=" + iterations);
068: }
069:
070: /**
071: * @see com.torunski.crawler.model.ICrawlerModel#isEmpty()
072: */
073: public synchronized boolean isEmpty() {
074: return (toVisitURIs.size() == 0) || (iterations <= 0);
075: }
076:
077: /**
078: * @see com.torunski.crawler.model.ICrawlerModel#pop()
079: */
080: public synchronized Link pop() {
081: // reduce the iterations without a check
082: iterations--;
083:
084: // remove a link from the stack
085: Link link = (Link) toVisitURIs.values().iterator().next();
086: toVisitURIs.remove(link.getURI());
087:
088: // mark this link as visited
089: visitedURIs.put(link.getURI(), link);
090:
091: // return the URI
092: return link;
093: }
094:
095: /**
096: * @see com.torunski.crawler.model.ICrawlerModel#add(com.torunski.crawler.link.Link, java.lang.String)
097: */
098: public void add(Link origin, String uri) {
099: // in this crawler type we can ignore the originUri
100: addInternal(uri);
101: }
102:
103: /**
104: * @see com.torunski.crawler.model.ICrawlerModel#add(com.torunski.crawler.link.Link, java.util.Collection)
105: */
106: public void add(Link origin, Collection uri) {
107: // in this crawler type we can ignore the originUri
108: // the rest of the links can be visited (but avoid double entries via the HashSet)
109: Iterator iter = uri.iterator();
110: while (iter.hasNext()) {
111: addInternal((String) iter.next());
112: }
113: }
114:
115: /**
116: * @see com.torunski.crawler.model.ICrawlerModel#getVisitedURIs()
117: */
118: public Collection getVisitedURIs() {
119: return visitedURIs.values();
120: }
121:
122: /**
123: * @see com.torunski.crawler.model.ICrawlerModel#getToVisitURIs()
124: */
125: public Collection getToVisitURIs() {
126: return toVisitURIs.values();
127: }
128:
129: // --- internal methods and/or classes ---
130:
131: /** HashMap to avoid that links are added more than once */
132: private HashMap foundLinks = new HashMap();
133:
134: /**
135: * Adds a new URI to the to be visted list without checking the visited URIs.
136: * @param uri the link of the new URI
137: */
138: private synchronized void addInternal(String uri) {
139: // find the link via the hashcode
140: Link l = (Link) foundLinks.get(uri);
141:
142: // is the link new
143: if (l == null) {
144: l = createLink(null, uri);
145: foundLinks.put(uri, l);
146: toVisitURIs.put(uri, l);
147: }
148: }
149:
150: /**
151: * @see com.torunski.crawler.model.ICrawlerModel#createLink(java.lang.String, java.lang.String)
152: */
153: public Link createLink(String orginUri, String uri) {
154: // avoid duplicated links
155: Link link = (Link) foundLinks.get(uri);
156:
157: // is the link new
158: if (link == null) {
159: link = new Link(uri);
160: foundLinks.put(uri, link); // FIXME double put, see addInternal
161: }
162:
163: return link;
164: }
165:
166: }
|