001: // ResourceInfo.java
002: // -------------------------------------
003: // part of YACY
004: // (C) by Michael Peter Christen; mc@anomic.de
005: // first published on http://www.anomic.de
006: // Frankfurt, Germany, 2006
007: //
008: // This file ist contributed by Martin Thelian
009: //
010: // $LastChangedDate: 2006-02-20 23:57:42 +0100 (Mo, 20 Feb 2006) $
011: // $LastChangedRevision: 1715 $
012: // $LastChangedBy: theli $
013: //
014: // This program is free software; you can redistribute it and/or modify
015: // it under the terms of the GNU General Public License as published by
016: // the Free Software Foundation; either version 2 of the License, or
017: // (at your option) any later version.
018: //
019: // This program is distributed in the hope that it will be useful,
020: // but WITHOUT ANY WARRANTY; without even the implied warranty of
021: // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
022: // GNU General Public License for more details.
023: //
024: // You should have received a copy of the GNU General Public License
025: // along with this program; if not, write to the Free Software
026: // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
027: //
028: // Using this software in any meaning (reading, learning, copying, compiling,
029: // running) means that you agree that the Author(s) is (are) not responsible
030: // for cost, loss of data or any harm that may be caused directly or indirectly
031: // by usage of this softare or this documentation. The usage of this software
032: // is on your own risk. The installation and usage (starting/running) of this
033: // software may allow other people or application to access your computer and
034: // any attached devices and is highly dependent on the configuration of the
035: // software which must be done by the user of the software; the author(s) is
036: // (are) also not responsible for proper configuration and usage of the
037: // software, even if provoked by documentation provided together with
038: // the software.
039: //
040: // Any changes to this file according to the GPL as documented in the file
041: // gpl.txt aside this file in the shipment you received can be done to the
042: // lines that follows this copyright notice here, but changes must not be
043: // done inside the copyright notive above. A re-distribution must contain
044: // the intact and unchanged copyright notice.
045: // Contributions and changes to the program code must be marked as such.
046:
047: package de.anomic.plasma.cache.http;
048:
import java.util.Date;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
052:
053: import de.anomic.http.httpHeader;
054: import de.anomic.plasma.plasmaHTCache;
055: import de.anomic.plasma.cache.IResourceInfo;
056: import de.anomic.plasma.cache.ResourceInfoFactory;
057: import de.anomic.server.serverDate;
058: import de.anomic.yacy.yacyURL;
059:
060: public class ResourceInfo implements IResourceInfo {
061: private yacyURL url;
062: private httpHeader responseHeader;
063: private httpHeader requestHeader;
064:
065: /**
066: * Constructor used by the {@link ResourceInfoFactory}
067: * @param objectURL
068: * @param objectInfo
069: */
070: public ResourceInfo(yacyURL objectURL,
071: Map<String, String> objectInfo) {
072: if (objectURL == null)
073: throw new NullPointerException();
074: if (objectInfo == null)
075: throw new NullPointerException();
076:
077: // generating the url hash
078: this .url = objectURL;
079:
080: // create the http header object
081: this .responseHeader = new httpHeader(null, objectInfo);
082: }
083:
084: public ResourceInfo(yacyURL objectURL, httpHeader requestHeaders,
085: httpHeader responseHeaders) {
086: if (objectURL == null)
087: throw new NullPointerException();
088: if (responseHeaders == null)
089: throw new NullPointerException();
090:
091: // generating the url hash
092: this .url = objectURL;
093:
094: this .requestHeader = requestHeaders;
095: this .responseHeader = responseHeaders;
096: }
097:
098: public TreeMap<String, String> getMap() {
099: return this .responseHeader;
100: }
101:
102: /**
103: * @see de.anomic.plasma.cache.IResourceInfo#getMimeType()
104: */
105: public String getMimeType() {
106: if (this .responseHeader == null)
107: return null;
108:
109: String mimeType = this .responseHeader.mime();
110: mimeType = mimeType.trim().toLowerCase();
111:
112: int pos = mimeType.indexOf(';');
113: return ((pos < 0) ? mimeType : mimeType.substring(0, pos));
114: }
115:
116: public String getCharacterEncoding() {
117: if (this .responseHeader == null)
118: return null;
119: return this .responseHeader.getCharacterEncoding();
120: }
121:
122: /**
123: * @see de.anomic.plasma.cache.IResourceInfo#getModificationDate()
124: */
125: public Date getModificationDate() {
126: Date docDate = null;
127:
128: if (this .responseHeader != null) {
129: docDate = this .responseHeader.lastModified();
130: if (docDate == null)
131: docDate = this .responseHeader.date();
132: }
133: if (docDate == null)
134: docDate = new Date(serverDate.correctedUTCTime());
135:
136: return docDate;
137: }
138:
139: public yacyURL getRefererUrl() {
140: if (this .requestHeader == null)
141: return null;
142: try {
143: return new yacyURL((String) this .requestHeader.get(
144: httpHeader.REFERER, ""), null);
145: } catch (Exception e) {
146: return null;
147: }
148: }
149:
150: /**
151: * @see de.anomic.plasma.cache.IResourceInfo#getUrl()
152: */
153: public yacyURL getUrl() {
154: return this .url;
155: }
156:
157: /**
158: * @see de.anomic.plasma.cache.IResourceInfo#getUrlHash()
159: */
160: public String getUrlHash() {
161: return this .url.hash();
162: }
163:
164: public void setRequestHeader(httpHeader reqestHeader) {
165: this .requestHeader = reqestHeader;
166: }
167:
168: /**
169: * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForCrawler()
170: */
171: public String shallIndexCacheForCrawler() {
172: String mimeType = this .getMimeType();
173: if (plasmaHTCache.isPicture(mimeType)) {
174: return "Media_Content_(Picture)";
175: }
176: if (!plasmaHTCache.isText(mimeType)) {
177: return "Media_Content_(not_text)";
178: }
179: return null;
180: }
181:
182: /**
183: * @see de.anomic.plasma.cache.IResourceInfo#shallIndexCacheForProxy()
184: */
185: public String shallIndexCacheForProxy() {
186: // -set-cookie in response
187: // the set-cookie from the server does not indicate that the content is special
188: // thus we do not care about it here for indexing
189:
190: // a picture cannot be indexed
191: String mimeType = this .getMimeType();
192: if (plasmaHTCache.isPicture(mimeType)) {
193: return "Media_Content_(Picture)";
194: }
195: if (!plasmaHTCache.isText(mimeType)) {
196: return "Media_Content_(not_text)";
197: }
198:
199: // -if-modified-since in request
200: // if the page is fresh at the very moment we can index it
201: Date ifModifiedSince = getModificationDate();
202: if ((ifModifiedSince != null)
203: && (this .responseHeader
204: .containsKey(httpHeader.LAST_MODIFIED))) {
205: // parse date
206: Date d = this .responseHeader.lastModified();
207: if (d == null) {
208: d = new Date(serverDate.correctedUTCTime());
209: }
210: // finally, we shall treat the cache as stale if the modification time is after the if-.. time
211: if (d.after(ifModifiedSince)) {
212: //System.out.println("***not indexed because if-modified-since");
213: return "Stale_(Last-Modified>Modified-Since)";
214: }
215: }
216:
217: // -pragma in cached response
218: if (this .responseHeader.containsKey(httpHeader.PRAGMA)
219: && ((String) this .responseHeader.get(httpHeader.PRAGMA))
220: .toUpperCase().equals("NO-CACHE")) {
221: return "Denied_(pragma_no_cache)";
222: }
223:
224: // see for documentation also:
225: // http://www.web-caching.com/cacheability.html
226:
227: // look for freshnes information
228:
229: // -expires in cached response
230: // the expires value gives us a very easy hint when the cache is stale
231: // sometimes, the expires date is set to the past to prevent that a page is cached
232: // we use that information to see if we should index it
233: final Date expires = this .responseHeader.expires();
234: if (expires != null
235: && expires.before(new Date(serverDate
236: .correctedUTCTime()))) {
237: return "Stale_(Expired)";
238: }
239:
240: // -lastModified in cached response
241: // this information is too weak to use it to prevent indexing
242: // even if we can apply a TTL heuristic for cache usage
243:
244: // -cache-control in cached response
245: // the cache-control has many value options.
246: String cacheControl = (String) this .responseHeader
247: .get(httpHeader.CACHE_CONTROL);
248: if (cacheControl != null) {
249: cacheControl = cacheControl.trim().toUpperCase();
250: /* we have the following cases for cache-control:
251: "public" -- can be indexed
252: "private", "no-cache", "no-store" -- cannot be indexed
253: "max-age=<delta-seconds>" -- stale/fresh dependent on date
254: */
255: if (cacheControl.startsWith("PRIVATE")
256: || cacheControl.startsWith("NO-CACHE")
257: || cacheControl.startsWith("NO-STORE")) {
258: // easy case
259: return "Stale_(denied_by_cache-control=" + cacheControl
260: + ")";
261: // } else if (cacheControl.startsWith("PUBLIC")) {
262: // // ok, do nothing
263: } else if (cacheControl.startsWith("MAX-AGE=")) {
264: // we need also the load date
265: final Date date = this .responseHeader.date();
266: if (date == null) {
267: return "Stale_(no_date_given_in_response)";
268: }
269: try {
270: final long ttl = 1000 * Long.parseLong(cacheControl
271: .substring(8)); // milliseconds to live
272: if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
273: //System.out.println("***not indexed because cache-control");
274: return "Stale_(expired_by_cache-control)";
275: }
276: } catch (Exception e) {
277: return "Error_(" + e.getMessage() + ")";
278: }
279: }
280: }
281: return null;
282: }
283:
284: public String shallStoreCacheForProxy() {
285: if (this .requestHeader != null) {
286: // -authorization cases in request
287: // authorization makes pages very individual, and therefore we cannot use the
288: // content in the cache
289: if (this .requestHeader
290: .containsKey(httpHeader.AUTHORIZATION)) {
291: return "personalized";
292: }
293: // -ranges in request and response
294: // we do not cache partial content
295: if (this .requestHeader.containsKey(httpHeader.RANGE)) {
296: return "partial";
297: }
298: }
299:
300: if (this .responseHeader != null) {
301: // -ranges in request and response
302: // we do not cache partial content
303: if (this .responseHeader
304: .containsKey(httpHeader.CONTENT_RANGE)) {
305: return "partial";
306: }
307:
308: // -if-modified-since in request
309: // we do not care about if-modified-since, because this case only occurres if the
310: // cache file does not exist, and we need as much info as possible for the indexing
311:
312: // -cookies in request
313: // we do not care about cookies, because that would prevent loading more pages
314: // from one domain once a request resulted in a client-side stored cookie
315:
316: // -set-cookie in response
317: // we do not care about cookies in responses, because that info comes along
318: // any/many pages from a server and does not express the validity of the page
319: // in modes of life-time/expiration or individuality
320:
321: // -pragma in response
322: // if we have a pragma non-cache, we don't cache. usually if this is wanted from
323: // the server, it makes sense
324: String cacheControl = (String) this .responseHeader
325: .get(httpHeader.PRAGMA);
326: if (cacheControl != null
327: && cacheControl.trim().toUpperCase().equals(
328: "NO-CACHE")) {
329: return "controlled_no_cache";
330: }
331:
332: // -expires in response
333: // we do not care about expires, because at the time this is called the data is
334: // obvious valid and that header info is used in the indexing later on
335:
336: // -cache-control in response
337: // the cache-control has many value options.
338: cacheControl = (String) this .responseHeader
339: .get(httpHeader.CACHE_CONTROL);
340: if (cacheControl != null) {
341: cacheControl = cacheControl.trim().toUpperCase();
342: if (cacheControl.startsWith("MAX-AGE=")) {
343: // we need also the load date
344: Date date = this .responseHeader.date();
345: if (date == null)
346: return "stale_no_date_given_in_response";
347: try {
348: long ttl = 1000 * Long.parseLong(cacheControl
349: .substring(8)); // milliseconds to live
350: if (serverDate.correctedUTCTime()
351: - date.getTime() > ttl) {
352: //System.out.println("***not indexed because cache-control");
353: return "stale_expired";
354: }
355: } catch (Exception e) {
356: return "stale_error_" + e.getMessage() + ")";
357: }
358: }
359: }
360: }
361: return null;
362: }
363:
364: public boolean shallUseCacheForProxy() {
365:
366: String cacheControl;
367: if (this .requestHeader != null) {
368: // -authorization cases in request
369: if (this .requestHeader
370: .containsKey(httpHeader.AUTHORIZATION)) {
371: return false;
372: }
373:
374: // -ranges in request
375: // we do not cache partial content
376: if (this .requestHeader.containsKey(httpHeader.RANGE)) {
377: return false;
378: }
379:
380: // if the client requests a un-cached copy of the resource ...
381: cacheControl = (String) this .requestHeader
382: .get(httpHeader.PRAGMA);
383: if (cacheControl != null
384: && cacheControl.trim().toUpperCase().equals(
385: "NO-CACHE")) {
386: return false;
387: }
388:
389: cacheControl = (String) this .requestHeader
390: .get(httpHeader.CACHE_CONTROL);
391: if (cacheControl != null) {
392: cacheControl = cacheControl.trim().toUpperCase();
393: if (cacheControl.startsWith("NO-CACHE")
394: || cacheControl.startsWith("MAX-AGE=0")) {
395: return false;
396: }
397: }
398:
399: // -if-modified-since in request
400: // The entity has to be transferred only if it has
401: // been modified since the date given by the If-Modified-Since header.
402: if (this .requestHeader
403: .containsKey(httpHeader.IF_MODIFIED_SINCE)) {
404: // checking this makes only sense if the cached response contains
405: // a Last-Modified field. If the field does not exist, we go the safe way
406: if (!this .responseHeader
407: .containsKey(httpHeader.LAST_MODIFIED)) {
408: return false;
409: }
410: // parse date
411: Date d1, d2;
412: d2 = this .responseHeader.lastModified();
413: if (d2 == null) {
414: d2 = new Date(serverDate.correctedUTCTime());
415: }
416: d1 = this .requestHeader.ifModifiedSince();
417: if (d1 == null) {
418: d1 = new Date(serverDate.correctedUTCTime());
419: }
420: // finally, we shall treat the cache as stale if the modification time is after the if-.. time
421: if (d2.after(d1)) {
422: return false;
423: }
424: }
425:
426: String mimeType = this .getMimeType();
427: if (!plasmaHTCache.isPicture(mimeType)) {
428: // -cookies in request
429: // unfortunately, we should reload in case of a cookie
430: // but we think that pictures can still be considered as fresh
431: // -set-cookie in cached response
432: // this is a similar case as for COOKIE.
433: if (this .requestHeader.containsKey(httpHeader.COOKIE)
434: || this .responseHeader
435: .containsKey(httpHeader.SET_COOKIE)
436: || this .responseHeader
437: .containsKey(httpHeader.SET_COOKIE2)) {
438: return false; // too strong
439: }
440: }
441: }
442:
443: // -pragma in cached response
444: // logically, we would not need to care about no-cache pragmas in cached response headers,
445: // because they cannot exist since they are not written to the cache.
446: // So this IF should always fail..
447: cacheControl = (String) this .responseHeader
448: .get(httpHeader.PRAGMA);
449: if (cacheControl != null
450: && cacheControl.trim().toUpperCase().equals("NO-CACHE")) {
451: return false;
452: }
453:
454: // see for documentation also:
455: // http://www.web-caching.com/cacheability.html
456: // http://vancouver-webpages.com/CacheNow/
457:
458: // look for freshnes information
459: // if we don't have any freshnes indication, we treat the file as stale.
460: // no handle for freshness control:
461:
462: // -expires in cached response
463: // the expires value gives us a very easy hint when the cache is stale
464: Date expires = this .responseHeader.expires();
465: if (expires != null) {
466: // System.out.println("EXPIRES-TEST: expires=" + expires + ", NOW=" + serverDate.correctedGMTDate() + ", url=" + url);
467: if (expires.before(new Date(serverDate.correctedUTCTime()))) {
468: return false;
469: }
470: }
471: Date lastModified = this .responseHeader.lastModified();
472: cacheControl = (String) this .responseHeader
473: .get(httpHeader.CACHE_CONTROL);
474: if (cacheControl == null && lastModified == null
475: && expires == null) {
476: return false;
477: }
478:
479: // -lastModified in cached response
480: // we can apply a TTL (Time To Live) heuristic here. We call the time delta between the last read
481: // of the file and the last modified date as the age of the file. If we consider the file as
482: // middel-aged then, the maximum TTL would be cache-creation plus age.
483: // This would be a TTL factor of 100% we want no more than 10% TTL, so that a 10 month old cache
484: // file may only be treated as fresh for one more month, not more.
485: Date date = this .responseHeader.date();
486: if (lastModified != null) {
487: if (date == null) {
488: date = new Date(serverDate.correctedUTCTime());
489: }
490: long age = date.getTime() - lastModified.getTime();
491: if (age < 0) {
492: return false;
493: }
494: // TTL (Time-To-Live) is age/10 = (d2.getTime() - d1.getTime()) / 10
495: // the actual living-time is serverDate.correctedGMTDate().getTime() - d2.getTime()
496: // therefore the cache is stale, if serverDate.correctedGMTDate().getTime() - d2.getTime() > age/10
497: if (serverDate.correctedUTCTime() - date.getTime() > age / 10) {
498: return false;
499: }
500: }
501:
502: // -cache-control in cached response
503: // the cache-control has many value options.
504: if (cacheControl != null) {
505: cacheControl = cacheControl.trim().toUpperCase();
506: if (cacheControl.startsWith("PRIVATE")
507: || cacheControl.startsWith("NO-CACHE")
508: || cacheControl.startsWith("NO-STORE")) {
509: // easy case
510: return false;
511: // } else if (cacheControl.startsWith("PUBLIC")) {
512: // // ok, do nothing
513: } else if (cacheControl.startsWith("MAX-AGE=")) {
514: // we need also the load date
515: if (date == null) {
516: return false;
517: }
518: try {
519: final long ttl = 1000 * Long.parseLong(cacheControl
520: .substring(8)); // milliseconds to live
521: if (serverDate.correctedUTCTime() - date.getTime() > ttl) {
522: return false;
523: }
524: } catch (Exception e) {
525: return false;
526: }
527: }
528: }
529: return true;
530: }
531:
532: public boolean validResponseStatus(String responseStatus) {
533: return responseStatus.startsWith("200")
534: || responseStatus.startsWith("203");
535: }
536:
537: public Date ifModifiedSince() {
538: return (this .requestHeader == null) ? null : this .requestHeader
539: .ifModifiedSince();
540: }
541:
542: public boolean requestWithCookie() {
543: return (this .requestHeader == null) ? false
544: : this .requestHeader.containsKey(httpHeader.COOKIE);
545: }
546:
547: public boolean requestProhibitsIndexing() {
548: return (this .requestHeader == null) ? false
549: : this .requestHeader
550: .containsKey(httpHeader.X_YACY_INDEX_CONTROL)
551: && ((String) this .requestHeader
552: .get(httpHeader.X_YACY_INDEX_CONTROL))
553: .toUpperCase().equals("NO-INDEX");
554: }
555:
556: public httpHeader getRequestHeader() {
557: return this .requestHeader;
558: }
559:
560: public httpHeader getResponseHeader() {
561: return this.responseHeader;
562: }
563: }
|