001: /*
002: * Copyright 2005 by Lars Torunski
003: *
004: * Licensed under the Apache License, Version 2.0 (the "License");
005: * you may not use this file except in compliance with the License.
006: * You may obtain a copy of the License at
007: *
008: * http://www.apache.org/licenses/LICENSE-2.0
009: *
010: * Unless required by applicable law or agreed to in writing, software
011: * distributed under the License is distributed on an "AS IS" BASIS,
012: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013: * See the License for the specific language governing permissions and
014: * limitations under the License.
015: *
016: */
017: package com.torunski.crawler.model;
018:
019: import java.util.Collection;
020: import java.util.HashMap;
021: import java.util.Iterator;
022: import java.util.NoSuchElementException;
023: import java.util.TreeSet;
024:
025: import org.apache.commons.logging.Log;
026: import org.apache.commons.logging.LogFactory;
027:
028: import com.torunski.crawler.link.Link;
029: import com.torunski.crawler.link.LinkDepth;
030: import com.torunski.crawler.link.LinkDepthComparator;
031:
032: /**
 * Simple graph model parser that does not register incoming and outgoing links.
 * It stops at a maximum depth.
035: *
036: * @author Lars Torunski
037: * @version $Revision: 1.11 $
038: */
039: public class MaxDepthModel implements ICrawlerModel {
040:
041: private static final transient Log log = LogFactory
042: .getLog(MaxDepthModel.class);
043:
044: /** The default depth */
045: public static final int DEFAULT_MAX_DEPTH = 4;
046:
047: /** The max depth */
048: private int depth;
049:
050: /** A map of the visited links */
051: private HashMap visitedURIs = new HashMap();
052:
053: /** A set of the to be visited links */
054: private TreeSet toVisitURIs = new TreeSet(new LinkDepthComparator());
055:
056: /**
057: * Constructor for Crawler.
058: */
059: public MaxDepthModel() {
060: this (DEFAULT_MAX_DEPTH);
061: }
062:
063: /**
064: * Constructor for Crawler.
065: * @param depth the max depth.
066: */
067: public MaxDepthModel(int depth) {
068: this .depth = depth;
069:
070: log.debug("Crawler model: " + MaxDepthModel.class.getName());
071: log.debug("- max depth=" + depth);
072: }
073:
074: /**
075: * @see com.torunski.crawler.model.ICrawlerModel#isEmpty()
076: */
077: public boolean isEmpty() {
078: // check if there is at least one link left
079: if (toVisitURIs.size() == 0) {
080: return true;
081: }
082:
083: // get the next element (first element in the set)
084: LinkDepth l = (LinkDepth) toVisitURIs.first();
085:
086: return l.getDepth() > depth;
087: }
088:
089: /**
090: * @see com.torunski.crawler.model.ICrawlerModel#pop()
091: */
092: public synchronized Link pop() {
093: // check constraint
094: if (toVisitURIs.size() == 0) {
095: throw new NoSuchElementException(
096: "No more URIs in MaxDepthModel.");
097: }
098:
099: // get the next element and remove it from the list
100: LinkDepth l = (LinkDepth) toVisitURIs.first();
101: toVisitURIs.remove(l);
102:
103: // check constraint
104: if (l.getDepth() > depth) {
105: throw new NoSuchElementException(
106: "Max depth reached in MaxDepthModel.");
107: }
108:
109: // mark this link as visited
110: visitedURIs.put(l.getURI(), l);
111:
112: // return the link
113: return l;
114: }
115:
116: /**
117: * @see com.torunski.crawler.model.ICrawlerModel#add(com.torunski.crawler.link.Link, java.lang.String)
118: */
119: public void add(Link origin, String uri) {
120: int originDepth = getDepth(origin);
121: addInternal(originDepth, uri);
122: }
123:
124: /**
125: * @see com.torunski.crawler.model.ICrawlerModel#add(com.torunski.crawler.link.Link, java.util.Collection)
126: */
127: public void add(Link origin, Collection uri) {
128: int originDepth = getDepth(origin);
129:
130: Iterator iter = uri.iterator();
131: while (iter.hasNext()) {
132: addInternal(originDepth, (String) iter.next());
133: }
134: }
135:
136: /**
137: * @see com.torunski.crawler.model.ICrawlerModel#getVisitedURIs()
138: */
139: public Collection getVisitedURIs() {
140: return visitedURIs.values();
141: }
142:
143: /**
144: * @see com.torunski.crawler.model.ICrawlerModel#getToVisitURIs()
145: */
146: public Collection getToVisitURIs() {
147: return toVisitURIs;
148: }
149:
150: // --- internal methods and/or classes ---
151:
152: /**
153: * @param Link the URI of the link
154: * @return returns the depth of a visited uri.
155: */
156: private int getDepth(Link link) {
157: if (link == null) {
158: return -1;
159: }
160:
161: LinkDepth l = (LinkDepth) visitedURIs.get(link.getURI());
162:
163: if (l != null) {
164: return l.getDepth();
165: } else {
166: // this is the root
167: return -1;
168: }
169: }
170:
171: /** HashMap to avoid that links are added more than once */
172: private HashMap foundLinks = new HashMap();
173:
174: /**
175: * Adds a new URI to the to be visted list.
176: * @param originDepth the depth of the origin URI
177: * @param uri the link of the new URI
178: */
179: private synchronized void addInternal(int originDepth, String uri) {
180: // find the link via the hashcode
181: LinkDepth l = (LinkDepth) foundLinks.get(uri);
182:
183: // the depth of the uri
184: int depth = originDepth + 1;
185:
186: // is the link new
187: if (l == null) {
188: l = createLink(null, uri, depth);
189: foundLinks.put(uri, l);
190: toVisitURIs.add(l);
191: } else {
192: // check if depth is to change
193: if (depth < l.getDepth()) {
194: toVisitURIs.remove(l);
195: l.setDepth(depth);
196: toVisitURIs.add(l);
197: }
198: }
199: }
200:
201: /**
202: * @see com.torunski.crawler.model.ICrawlerModel#createLink(java.lang.String, java.lang.String)
203: */
204: public Link createLink(String originUri, String uri) {
205: return createLink(originUri, uri, 0);
206: }
207:
208: /**
209: * Creates a <b>new</b> link based on the orginUri and uri parameter.
210: * @param originUri the origin URI of the link
211: * @param uri the new URI
212: * @param depth the depth of the link
213: */
214: public LinkDepth createLink(String originUri, String uri, int depth) {
215: return new LinkDepth(uri, depth);
216: }
217:
218: }
|