001: package net.javacoding.jspider.core.model;
002:
003: import net.javacoding.jspider.api.model.*;
004: import net.javacoding.jspider.core.storage.exception.InvalidStateForActionException;
005: import net.javacoding.jspider.core.storage.exception.InvalidStateTransitionException;
006: import net.javacoding.jspider.core.storage.spi.StorageSPI;
007: import net.javacoding.jspider.core.logging.LogFactory;
008: import net.javacoding.jspider.core.util.URLUtil;
009:
010: import java.io.InputStream;
011: import java.net.URL;
012: import java.util.*;
013:
014: /**
015: *
016: * $Id: ResourceInternal.java,v 1.13 2003/04/11 16:37:04 vanrogu Exp $
017: *
018: * @author Günther Van Roey
019: */
020: public class ResourceInternal implements ParsedResource,
021: ParseErrorResource, ParseIgnoredResource, ForbiddenResource,
022: FetchIgnoredResource, FetchErrorResource {
023:
024: protected StorageSPI storage;
025:
026: protected int site;
027:
028: protected URL url;
029: protected Date discoveryTime;
030: protected FolderInternal folder;
031: protected int state;
032: protected int id;
033:
034: protected int httpStatus;
035: protected int size;
036: protected int timeMs;
037: protected String mimeType;
038: protected Date fetchTime;
039: protected HTTPHeader[] headers;
040:
041: protected Decision spiderDecision;
042: protected Decision parseDecision;
043:
044: public ResourceInternal(StorageSPI storage, int id, int siteId,
045: URL url, Date discoveryTime, FolderInternal folder) {
046: this .site = siteId;
047: this .storage = storage;
048: this .id = id;
049: this .url = url;
050: this .discoveryTime = discoveryTime;
051: this .folder = folder;
052:
053: this .state = Resource.STATE_DISCOVERED;
054: }
055:
056: public ResourceInternal(StorageSPI storage, int id, Site site,
057: URL url, Date discoveryTime, FolderInternal folder) {
058: this (storage, id, ((SiteInternal) site).getId(), url,
059: discoveryTime, folder);
060: }
061:
062: public ResourceInternal(StorageSPI storage, Site site, URL url,
063: Date discoveryTime, FolderInternal folder) {
064: this (storage, 0, site, url, discoveryTime, folder);
065: }
066:
067: public void setFetched(int httpStatus, int size, int timeMs,
068: String mimeType, Date fetchTime, HTTPHeader[] headers) {
069: if (state != Resource.STATE_DISCOVERED) {
070: LogFactory.getLog(Resource.class).error(
071: "error in state transition for resource " + url
072: + ":\n" + this );
073: throw new InvalidStateTransitionException(
074: "cannot set resource fetched - it's not in the discovered state - was "
075: + state);
076: }
077: this .httpStatus = httpStatus;
078: this .size = size;
079: this .timeMs = timeMs;
080: this .mimeType = mimeType;
081: this .fetchTime = fetchTime;
082: this .headers = headers;
083: state = Resource.STATE_FETCHED;
084: }
085:
086: public void setFetchError(int httpStatus, HTTPHeader[] headers) {
087: if (state != Resource.STATE_DISCOVERED
088: && state != Resource.STATE_FETCH_ERROR) {
089: LogFactory.getLog(Resource.class).error(
090: "error in state transition for resource " + url
091: + ":\n" + this );
092: throw new InvalidStateTransitionException(
093: "cannot set resource fetch error - it's not in the discovered state - was"
094: + state);
095: }
096: this .httpStatus = httpStatus;
097: this .headers = headers;
098: state = Resource.STATE_FETCH_ERROR;
099: }
100:
101: public void setParseError() {
102: if (state != Resource.STATE_FETCHED
103: && state != Resource.STATE_PARSE_ERROR) {
104: LogFactory.getLog(Resource.class).error(
105: "error in state transition for resource " + url
106: + ":\n" + this );
107: throw new InvalidStateTransitionException(
108: "cannot set resource parse error - it's not in the fetched state - was "
109: + state);
110: }
111: state = Resource.STATE_PARSE_ERROR;
112: }
113:
114: public void setParsed() {
115: if (state != Resource.STATE_FETCHED
116: && state != Resource.STATE_PARSED) {
117: LogFactory.getLog(Resource.class).error(
118: "error in state transition for resource " + url
119: + ":\n" + this );
120: throw new InvalidStateTransitionException(
121: "cannot set resource parsed - it's not in the fetched state - was "
122: + state);
123: }
124: state = Resource.STATE_PARSED;
125: }
126:
127: public void setFetchIgnored() {
128: if (state != Resource.STATE_DISCOVERED
129: && state != Resource.STATE_FETCH_IGNORED) {
130: LogFactory.getLog(Resource.class).error(
131: "error in state transition for resource " + url
132: + ":\n" + this );
133: throw new InvalidStateTransitionException(
134: "cannot set resource fetch_ignored - it's not in the discovered state - was "
135: + state);
136: }
137: state = Resource.STATE_FETCH_IGNORED;
138: }
139:
140: public void setParseIgnored() {
141: if (state != Resource.STATE_FETCHED
142: && state != Resource.STATE_PARSE_IGNORED) {
143: LogFactory.getLog(Resource.class).error(
144: "error in state transition for resource " + url
145: + ":\n" + this );
146: throw new InvalidStateTransitionException(
147: "cannot set resource parse_ignored - it's not in the fetched state - was "
148: + state);
149: }
150: state = Resource.STATE_PARSE_IGNORED;
151: }
152:
153: public void setForbidden() {
154: if (state != Resource.STATE_DISCOVERED
155: && state != Resource.STATE_FETCH_FORBIDDEN) {
156: LogFactory.getLog(Resource.class).error(
157: "error in state transition for resource " + url
158: + ":\n" + this );
159: throw new InvalidStateTransitionException(
160: "cannot set resource forbidden - it's not in the discovered state - was "
161: + state);
162: }
163: state = Resource.STATE_FETCH_FORBIDDEN;
164: }
165:
166: public int getId() {
167: return id;
168: }
169:
170: public void setInt(int id) {
171: this .id = id;
172: }
173:
174: public int getState() {
175: return state;
176: }
177:
178: public String getFileName() {
179: return URLUtil.getFileName(url);
180: }
181:
182: public URL getURL() {
183: return url;
184: }
185:
186: public Site getSite() {
187: return folder.getSite();
188: }
189:
190: public Folder getFolder() {
191: return folder;
192: }
193:
194: public String getName() {
195: return url.getFile();
196: }
197:
198: public Date getDiscoveryTime() {
199: return discoveryTime;
200: }
201:
202: public Resource[] getReferers() {
203: return storage.getResourceDAO().getRefereringResources(this );
204: }
205:
206: public Resource[] getReferencedResources() {
207: if (state != Resource.STATE_PARSED) {
208: throw new InvalidStateForActionException(
209: "cannot get referenced resources if not parsed");
210: }
211: return storage.getResourceDAO().getReferencedResources(this );
212: }
213:
214: public int getHttpStatus() {
215: if (state == Resource.STATE_DISCOVERED) {
216: throw new InvalidStateForActionException(
217: "cannot get http status for a resource that's not fetched");
218: }
219: return httpStatus;
220: }
221:
222: public int getHttpStatusInternal() {
223: return httpStatus;
224: }
225:
226: public void setHttpStatus(int status) {
227: this .httpStatus = status;
228: }
229:
230: public HTTPHeader[] getHeaders() {
231: return headers;
232: }
233:
234: public int getTimeMs() {
235: if (state < Resource.STATE_FETCHED) {
236: throw new InvalidStateForActionException(
237: "cannot get timing for non-fetched resource");
238: }
239: return timeMs;
240: }
241:
242: public int getTimeMsInternal() {
243: return timeMs;
244: }
245:
246: public int getSize() {
247: if (state < Resource.STATE_FETCHED) {
248: throw new InvalidStateForActionException(
249: "cannot get size for non-fetched resource");
250: }
251: return size;
252: }
253:
254: public int getSizeInternal() {
255: return size;
256: }
257:
258: public String getMime() {
259: if (state < Resource.STATE_FETCHED) {
260: throw new InvalidStateForActionException(
261: "cannot get mime type for non-fetched resource");
262: }
263: return mimeType;
264: }
265:
266: public String getMimeInternal() {
267: return mimeType;
268: }
269:
270: public Date getFetchTime() {
271: if (state < Resource.STATE_FETCHED) {
272: throw new InvalidStateForActionException(
273: "cannot get fetch time for non-fetched resource");
274: }
275: return fetchTime;
276: }
277:
278: public Date getFetchTimeInternal() {
279: return fetchTime;
280: }
281:
282: public String toString() {
283: StringBuffer sb = new StringBuffer();
284: sb.append(url.toString());
285: sb.append("\n STATUS : ");
286: sb.append(translateState(state));
287: sb.append("\n ");
288: sb.append("\n SPIDER DECISION : ");
289: Decision sd = getSpiderDecision();
290: if (sd == null) {
291: sb.append("\n ");
292: sb.append("[Not yet taken]");
293: } else {
294: DecisionStep[] steps = sd.getSteps();
295: for (int i = 0; i < steps.length; i++) {
296: DecisionStep step = steps[i];
297: sb.append("\n ");
298: sb.append(step.toString());
299: }
300: }
301: sb.append("\n ");
302: sb.append("\n PARSE DECISION : ");
303: Decision pd = getParseDecision();
304: if (pd == null) {
305: sb.append("\n ");
306: sb.append("[Not yet taken]");
307: } else {
308: DecisionStep[] steps = pd.getSteps();
309: for (int i = 0; i < steps.length; i++) {
310: DecisionStep step = steps[i];
311: sb.append("\n ");
312: sb.append(step.toString());
313: }
314: }
315: sb.append("\n");
316:
317: switch (state) {
318: case STATE_DISCOVERED:
319: break;
320: case STATE_FETCH_ERROR:
321: sb.append(" HTTP Status: ");
322: sb.append(this .getHttpStatus());
323: Resource[] referers = this .getReferers();
324: sb.append("\n REFERERS: " + referers.length);
325: for (int i = 0; i < referers.length; i++) {
326: Resource referer = referers[i];
327: sb.append("\n ");
328: sb.append(referer.getURL());
329: }
330: break;
331: case STATE_FETCH_IGNORED:
332: break;
333: case STATE_FETCH_FORBIDDEN:
334: break;
335: case STATE_FETCHED:
336: sb.append(" HTTP Status: ");
337: sb.append(this .getHttpStatus());
338: sb.append(", Content size: ");
339: sb.append(this .getSize());
340: sb.append(", Mime Type: ");
341: sb.append(this .getMime());
342: sb.append(", Fetch time: ");
343: sb.append(this .getTimeMs());
344: break;
345: case STATE_PARSE_ERROR:
346: break;
347: case STATE_PARSE_IGNORED:
348: break;
349: case STATE_PARSED:
350: sb.append(" HTTP Status: ");
351: sb.append(this .getHttpStatus());
352: sb.append(", Content size: ");
353: sb.append(this .getSize());
354: sb.append(", Mime Type: ");
355: sb.append(this .getMime());
356: sb.append(", Fetch time: ");
357: sb.append(this .getTimeMs());
358:
359: referers = this .getReferers();
360: sb.append("\n REFERERS: " + referers.length);
361: for (int i = 0; i < referers.length; i++) {
362: Resource referer = referers[i];
363: sb.append("\n ");
364: sb.append(referer.getURL());
365: }
366:
367: if (state == STATE_PARSED) {
368:
369: Resource[] references = this .getReferencedResources();
370: sb.append("\n REFERENCES: " + references.length);
371: for (int i = 0; i < references.length; i++) {
372: Resource reference = references[i];
373: sb.append("\n ");
374: sb.append(reference.getURL());
375: }
376:
377: EMailAddress[] emails = this .getEmailAddresses();
378: sb.append("\n E-MAIL ADDRESSES: " + emails.length);
379: for (int i = 0; i < emails.length; i++) {
380: EMailAddress email = emails[i];
381: sb.append("\n ");
382: sb.append(email.getAddress());
383: }
384:
385: } else {
386: sb
387: .append("\n EMAIL ADDRESSES and REFERENCES not known [Resource not parsed]");
388: }
389: break;
390: }
391:
392: sb.append("\n");
393:
394: return sb.toString();
395: }
396:
397: protected String translateState(int state) {
398: switch (state) {
399: case Resource.STATE_DISCOVERED:
400: return "DISCOVERED";
401: case Resource.STATE_FETCH_ERROR:
402: return "FETCH_ERROR";
403: case Resource.STATE_PARSE_ERROR:
404: return "PARSE_ERROR";
405: case Resource.STATE_FETCHED:
406: return "FETCHED";
407: case Resource.STATE_FETCH_FORBIDDEN:
408: return "FETCH_FORBIDDEN";
409: case Resource.STATE_FETCH_IGNORED:
410: return "FETCH_IGNORED";
411: case Resource.STATE_PARSE_IGNORED:
412: return "PARSE_IGNORED";
413: case Resource.STATE_PARSED:
414: return "PARSED";
415: default:
416: return "?!? UNKNOWN STATE ?!?";
417:
418: }
419: }
420:
421: public InputStream getInputStream() {
422: return storage.getContentDAO().getInputStream(id);
423: }
424:
425: public void setBytes(byte[] bytes) {
426: storage.getContentDAO().setBytes(id, bytes);
427: }
428:
429: public Date getFetchTimeStamp() {
430: return fetchTime;
431: }
432:
433: public String getStateName() {
434: return translateState(state);
435: }
436:
437: public Decision getSpiderDecision() {
438: return storage.getDecisionDAO().findSpiderDecision(this );
439: }
440:
441: public Decision getParseDecision() {
442: return storage.getDecisionDAO().findParseDecision(this );
443: }
444:
445: public void setState(int state) {
446: this .state = state;
447: }
448:
449: public void setMime(String mime) {
450: this .mimeType = mime;
451: }
452:
453: public void setTime(int ms) {
454: this .timeMs = ms;
455: }
456:
457: public void setSize(int size) {
458: this .size = size;
459: }
460:
461: public int getSiteId() {
462: return site;
463: }
464:
465: public ResourceReference[] getOutgoingReferences() {
466: return storage.getResourceDAO().getOutgoingReferences(this );
467: }
468:
469: public ResourceReference[] getIncomingReferences() {
470: return storage.getResourceDAO().getIncomingReferences(this );
471: }
472:
473: public EMailAddress[] getEmailAddresses() {
474: return storage.getEMailAddressDAO().findByResource(this );
475: }
476:
477: public EMailAddressReference[] getEmailAddressReferences() {
478: return storage.getEMailAddressDAO().findReferencesByResource(
479: this);
480: }
481: }
|