001: /* SettingsHandler
002: *
003: * $Id: SettingsHandler.java 4662 2006-09-25 23:45:21Z paul_jack $
004: *
005: * Created on Dec 16, 2003
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.settings;
026:
027: import java.io.File;
028: import java.lang.reflect.Constructor;
029: import java.lang.reflect.InvocationTargetException;
030: import java.text.ParseException;
031: import java.util.Collection;
032: import java.util.Collections;
033: import java.util.HashMap;
034: import java.util.HashSet;
035: import java.util.Iterator;
036: import java.util.List;
037: import java.util.Map;
038: import java.util.Set;
039: import java.util.logging.Level;
040:
041: import javax.management.AttributeNotFoundException;
042: import javax.management.InvalidAttributeValueException;
043:
044: import org.archive.crawler.datamodel.CrawlOrder;
045: import org.archive.crawler.settings.refinements.Refinement;
046: import org.archive.net.UURI;
047: import org.archive.util.ArchiveUtils;
048:
049: /** An instance of this class holds a hierarchy of settings.
050: *
051: * More than one instance in memory is allowed so that a new CrawlJob could
052: * be configured while another job is running.
053: *
054: * This class should be subclassed to adapt to a persistent storage.
055: *
056: * @author John Erik Halse
057: */
058: public abstract class SettingsHandler {
059: /** Cached CrawlerSettings objects */
060: private SettingsCache settingsCache = new SettingsCache(
061: new CrawlerSettings(this , null));
062:
063: /** Reference to the order module */
064: private CrawlOrder order;
065:
066: private Set<ValueErrorHandler> valueErrorHandlers = Collections
067: .synchronizedSet(new HashSet<ValueErrorHandler>());
068: private int errorReportingLevel = Level.ALL.intValue();
069:
/** Names of the datatypes supported by the settings framework. */
static final String INTEGER = "integer";
static final String LONG = "long";
static final String FLOAT = "float";
static final String DOUBLE = "double";
static final String BOOLEAN = "boolean";
static final String STRING = "string";
static final String TEXT = "text";
static final String OBJECT = "object";
static final String TIMESTAMP = "timestamp";
static final String MAP = "map";
static final String INTEGER_LIST = "integerList";
static final String LONG_LIST = "longList";
static final String FLOAT_LIST = "floatList";
static final String DOUBLE_LIST = "doubleList";
static final String STRING_LIST = "stringList";

/** Pairs of { type name, fully qualified class name }. */
private static final String[][] names = {
    { INTEGER, "java.lang.Integer" },
    { LONG, "java.lang.Long" },
    { FLOAT, "java.lang.Float" },
    { DOUBLE, "java.lang.Double" },
    { BOOLEAN, "java.lang.Boolean" },
    { STRING, "java.lang.String" },
    { TEXT, "org.archive.crawler.settings.TextField" },
    { OBJECT, "org.archive.crawler.settings.ModuleType" },
    { TIMESTAMP, "java.util.Date" },
    { MAP, "org.archive.crawler.settings.MapType" },
    { INTEGER_LIST, "org.archive.crawler.settings.IntegerList" },
    { LONG_LIST, "org.archive.crawler.settings.LongList" },
    { FLOAT_LIST, "org.archive.crawler.settings.FloatList" },
    { DOUBLE_LIST, "org.archive.crawler.settings.DoubleList" },
    { STRING_LIST, "org.archive.crawler.settings.StringList" },
};

/** Lookup from type name to class name, filled from {@code names}. */
private static final Map<String, String> name2class =
    new HashMap<String, String>();

/** Lookup from class name to type name, filled from {@code names}. */
private static final Map<String, String> class2name =
    new HashMap<String, String>();

static {
    // Populate both directions of the mapping from the single table above.
    for (String[] pair : names) {
        String typeName = pair[0];
        String className = pair[1];
        name2class.put(typeName, className);
        class2name.put(className, typeName);
    }
}
110:
/** Create a new SettingsHandler object.
 *
 * A new CrawlOrder is created, stored as this handler's order and then
 * told (via <code>setAsOrder</code>) which handler it belongs to.
 *
 * @throws InvalidAttributeValueException if raised while creating the
 *             CrawlOrder or installing it as the order of this handler.
 */
public SettingsHandler() throws InvalidAttributeValueException {
    // The field is assigned before setAsOrder() is invoked, so the order
    // is already reachable through this handler during that call.
    this.order = new CrawlOrder();
    this.order.setAsOrder(this);
}
119:
120: /** Initialize the SettingsHandler.
121: *
122: * This method reads the default settings from the persistent storage.
123: */
124: public void initialize() {
125: readSettingsObject(settingsCache.getGlobalSettings());
126: }
127:
128: public void cleanup() {
129: this .settingsCache = null;
130: if (this .order != null) {
131: this .order.setController(null);
132: }
133: this .order = null;
134: }
135:
136: /** Strip off the leftmost part of a domain name.
137: *
138: * @param scope the domain name.
139: * @return scope with everything before the first dot ripped off.
140: */
141: protected String getParentScope(String scope) {
142: int split = scope.indexOf('.');
143: return (split == -1) ? null : scope.substring(split + 1);
144: }
145:
146: /** Get a module by name.
147: *
148: * All modules in the order should have unique names. This method makes it
149: * possible to get the modules of the order by its name.
150: *
151: * @param name the modules name.
152: * @return the module the name references.
153: */
154: public ModuleType getModule(String name) {
155: return settingsCache.getGlobalSettings().getModule(name);
156: }
157:
158: /** Get a complex type by its absolute name.
159: *
160: * The absolute name is the complex types name and the path leading to
161: * it.
162: *
163: * @param settings the settings object to query.
164: * @param absoluteName the absolute name of the complex type to get.
165: * @return the complex type referenced by the absolute name or null if
166: * the complex type could not be found in this settings object.
167: * @throws AttributeNotFoundException is thrown if no ComplexType by this
168: * name exist.
169: */
170: public ComplexType getComplexTypeByAbsoluteName(
171: CrawlerSettings settings, String absoluteName)
172: throws AttributeNotFoundException {
173:
174: settings = settings == null ? settingsCache.getGlobalSettings()
175: : settings;
176:
177: DataContainer data = settings.getData(absoluteName);
178: if (data == null) {
179: CrawlerSettings parentSettings = settings.getParent();
180: if (parentSettings == null) {
181: throw new AttributeNotFoundException(absoluteName);
182: }
183: return getComplexTypeByAbsoluteName(parentSettings,
184: absoluteName);
185: }
186: return data.getComplexType();
187: }
188:
189: protected static String getTypeName(String className) {
190: return (String) class2name.get(className);
191: }
192:
193: protected static String getClassName(String typeName) {
194: return (String) name2class.get(typeName);
195: }
196:
197: /** Convert a String object to an object of <code>typeName</code>.
198: *
199: * @param stringValue string to convert.
200: * @param typeName type to convert to. typeName should be one of the
201: * supported types represented by constants in this class.
202: * @return the new value object.
203: * @throws ClassCastException is thrown if string could not be converted.
204: */
205: protected static Object StringToType(String stringValue,
206: String typeName) {
207: Object value;
208: if (typeName == SettingsHandler.STRING) {
209: value = stringValue;
210: } else if (typeName == SettingsHandler.TEXT) {
211: value = new TextField(stringValue);
212: } else if (typeName == SettingsHandler.INTEGER) {
213: value = Integer.decode(stringValue);
214: } else if (typeName == SettingsHandler.LONG) {
215: value = Long.decode(stringValue);
216: } else if (typeName == SettingsHandler.BOOLEAN) {
217: value = Boolean.valueOf(stringValue);
218: } else if (typeName == SettingsHandler.DOUBLE) {
219: value = Double.valueOf(stringValue);
220: } else if (typeName == SettingsHandler.FLOAT) {
221: value = Float.valueOf(stringValue);
222: } else if (typeName == SettingsHandler.TIMESTAMP) {
223: try {
224: value = ArchiveUtils.parse14DigitDate(stringValue);
225: } catch (ParseException e) {
226: throw new ClassCastException("Cannot convert '"
227: + stringValue + "' to type '" + typeName + "'");
228: }
229: } else {
230: throw new ClassCastException("Cannot convert '"
231: + stringValue + "' to type '" + typeName + "'");
232: }
233: return value;
234: }
235:
236: /** Get CrawlerSettings object in effect for a host or domain.
237: *
238: * If there is no specific settings for the host/domain, it will recursively
239: * go up the hierarchy to find the settings object that should be used for
240: * this host/domain.
241: *
242: * @param host the host or domain to get the settings for.
243: * @return settings object in effect for the host/domain.
244: * @see #getSettingsObject(String)
245: * @see #getOrCreateSettingsObject(String)
246: */
247: public CrawlerSettings getSettings(String host) {
248: return getRefinementsForSettings(getSettingsForHost(host), null);
249: }
250:
251: /** Get CrawlerSettings object in effect for a host or domain.
252: *
253: * If there is no specific settings for the host/domain, it will recursively
254: * go up the hierarchy to find the settings object that should be used for
255: * this host/domain.
256: * <p/>
257: * This method passes around a URI that refinement are checked against.
258: *
259: * @param host the host or domain to get the settings for.
260: * @param uuri UURI for context.
261: * @return settings object in effect for the host/domain.
262: * @see #getSettingsObject(String)
263: * @see #getOrCreateSettingsObject(String)
264: */
265: public CrawlerSettings getSettings(String host, UURI uuri) {
266: return getRefinementsForSettings(getSettingsForHost(host), uuri);
267: }
268:
269: protected CrawlerSettings getSettingsForHost(String host) {
270: CrawlerSettings settings = settingsCache
271: .getSettings(host, null);
272:
273: if (settings == null) {
274: String tmpHost = host;
275: settings = getSettingsObject(tmpHost);
276: while (settings == null && tmpHost != null) {
277: tmpHost = getParentScope(tmpHost);
278: settings = getSettingsObject(tmpHost);
279: }
280:
281: settingsCache.putSettings(host, settings);
282: }
283:
284: return settings;
285: }
286:
287: private CrawlerSettings getRefinementsForSettings(
288: CrawlerSettings settings, UURI uri) {
289: if (settings.hasRefinements()) {
290: for (Iterator it = settings.refinementsIterator(); it
291: .hasNext();) {
292: Refinement refinement = (Refinement) it.next();
293: if (refinement.isWithinRefinementBounds(uri)) {
294: settings = getSettingsObject(settings.getScope(),
295: refinement.getReference());
296: }
297: }
298: }
299:
300: return settings;
301: }
302:
303: /** Get CrawlerSettings object for a host or domain.
304: *
305: * The difference between this method and the
306: * <code>getSettings(String host)</code> is that this method will return
307: * null if there is no settings for particular host or domain.
308: *
309: * @param scope the host or domain to get the settings for.
310: * @return settings object for the host/domain or null if no
311: * settings exist for the host/domain.
312: * @see #getSettings(String)
313: * @see #getOrCreateSettingsObject(String)
314: */
315: public CrawlerSettings getSettingsObject(String scope) {
316: return getSettingsObject(scope, null);
317: }
318:
319: /**
320: * Get CrawlerSettings object for a host/domain and a particular refinement.
321: *
322: * @param scope the host or domain to get the settings for.
323: * @param refinement the refinement reference to get.
324: * @return CrawlerSettings object for a host/domain and a particular
325: * refinement or null if no settings exist for the host/domain.
326: */
327: public CrawlerSettings getSettingsObject(String scope,
328: String refinement) {
329: CrawlerSettings settings = settingsCache.getSettingsObject(
330: scope, refinement);
331:
332: if (settings == null) {
333: // Reference not found
334: settings = new CrawlerSettings(this , scope, refinement);
335: // Try to read settings from persisten storage. If its not there
336: // it will be set to null.
337: settings = readSettingsObject(settings);
338: if (settings != null) {
339: settingsCache.putSettings(scope, settings);
340: }
341: }
342: return settings;
343: }
344:
345: /** Get or create CrawlerSettings object for a host or domain.
346: *
347: * This method is similar to {@link #getSettingsObject(String)} except that
348: * if there is no settings for this particular host or domain a new settings
349: * object will be returned.
350: *
351: * @param scope the host or domain to get or create the settings for.
352: * @return settings object for the host/domain.
353: * @see #getSettings(String)
354: * @see #getSettingsObject(String)
355: */
356: public CrawlerSettings getOrCreateSettingsObject(String scope) {
357: return getOrCreateSettingsObject(scope, null);
358: }
359:
360: public CrawlerSettings getOrCreateSettingsObject(String scope,
361: String refinement) {
362: CrawlerSettings settings;
363: settings = getSettingsObject(scope, refinement);
364: if (settings == null) {
365: scope = scope.intern();
366:
367: // No existing settings object found, create one
368: settings = new CrawlerSettings(this , scope, refinement);
369: settingsCache.refreshHostToSettings();
370: settingsCache.putSettings(scope, settings);
371: }
372: return settings;
373: }
374:
/** Write the CrawlerSettings object to persistent storage.
 *
 * Abstract: the kind of storage and the way the settings are written are
 * defined by the subclass.
 *
 * @param settings the settings object to write.
 */
public abstract void writeSettingsObject(CrawlerSettings settings);
380:
/** Read the CrawlerSettings object from persistent storage.
 *
 * Abstract: implemented by the subclass for its kind of storage. Within
 * this class it is called by {@link #initialize()} for the global
 * settings and by {@link #getSettingsObject(String, String)} when the
 * requested settings are not in the cache.
 *
 * @param settings the settings object to be updated with data from the
 *            persistent storage.
 * @return the updated settings object or null if there was no data for
 *         this in the persistent storage.
 */
protected abstract CrawlerSettings readSettingsObject(
CrawlerSettings settings);
390:
391: /** Delete a settings object from persistent storage.
392: *
393: * @param settings the settings object to delete.
394: */
395: public void deleteSettingsObject(CrawlerSettings settings) {
396: settingsCache.deleteSettingsObject(settings);
397: }
398:
399: /** Get the CrawlOrder.
400: *
401: * @return the CrawlOrder
402: */
403: public CrawlOrder getOrder() {
404: return order;
405: }
406:
407: /** Instatiate a new ModuleType given its name and className.
408: *
409: * @param name the name for the new ComplexType.
410: * @param className the class name of the new ComplexType.
411: * @return an instance of the class identified by className.
412: *
413: * @throws InvocationTargetException
414: */
415: public static ModuleType instantiateModuleTypeFromClassName(
416: String name, String className)
417: throws InvocationTargetException {
418:
419: Class cl;
420: try {
421: cl = Class.forName(className);
422: } catch (ClassNotFoundException e) {
423: throw new InvocationTargetException(e);
424: }
425:
426: ModuleType module;
427: try {
428: Constructor co = cl
429: .getConstructor(new Class[] { String.class });
430: module = (ModuleType) co.newInstance(new Object[] { name });
431: } catch (IllegalArgumentException e) {
432: throw new InvocationTargetException(e);
433: } catch (InstantiationException e) {
434: throw new InvocationTargetException(e);
435: } catch (IllegalAccessException e) {
436: throw new InvocationTargetException(e);
437: } catch (SecurityException e) {
438: throw new InvocationTargetException(e);
439: } catch (NoSuchMethodException e) {
440: throw new InvocationTargetException(e);
441: }
442: return module;
443: }
444:
/**
 * Transforms a relative path so that it is relative to the location that
 * is regarded as the working directory for these settings. If an absolute
 * path is given, it is expected to be returned unchanged.
 *
 * Abstract: which location counts as the working directory is decided by
 * the subclass.
 *
 * @param path A relative path to a file (or directory)
 * @return The same path modified so that it is relative to the file level
 *         location that is considered the working directory for these
 *         settings.
 */
public abstract File getPathRelativeToWorkingDirectory(String path);
454:
/**
 * Returns a Collection of strings with the domains that contain
 * per-domain overrides (or whose subdomains contain them).
 *
 * The domains considered are limited to those that are subdomains of the
 * supplied domain. If null or the empty string is supplied, the TLDs are
 * considered.
 *
 * @param rootDomain The domain to get domain overrides for. Examples:
 *            'org', 'archive.org', 'crawler.archive.org' etc.
 * @return A collection of domains that contain overrides. If rootDomain
 *         does not exist an empty collection is expected to be returned.
 */
public abstract Collection getDomainOverrides(String rootDomain);
468:
469: /**
470: * Unregister an instance of {@link ValueErrorHandler}.
471: *
472: * @param errorHandler the <code>CalueErrorHandler</code> to unregister.
473: *
474: * @see ValueErrorHandler
475: * @see #setErrorReportingLevel(Level)
476: * @see #registerValueErrorHandler(ValueErrorHandler)
477: *
478: */
479: public void unregisterValueErrorHandler(
480: ValueErrorHandler errorHandler) {
481: valueErrorHandlers.remove(errorHandler);
482: }
483:
484: /**
485: * Register an instance of {@link ValueErrorHandler}.
486: * <p>
487: * If a ValueErrorHandler is registered, only constraints with level
488: * {@link Level#SEVERE}will throw an {@link InvalidAttributeValueException}.
489: * The ValueErrorHandler will recieve a notification for all failed checks
490: * with level equal or greater than the error reporting level.
491: *
492: * @param errorHandler the <code>CalueErrorHandler</code> to register.
493: *
494: * @see ValueErrorHandler
495: * @see #setErrorReportingLevel(Level)
496: * @see #unregisterValueErrorHandler(ValueErrorHandler)
497: */
498: public void registerValueErrorHandler(ValueErrorHandler errorHandler) {
499: if (errorHandler != null) {
500: valueErrorHandlers.add(errorHandler);
501: }
502: }
503:
504: /**
505: * Fire events on all registered {@link ValueErrorHandler}.
506: *
507: * @param error the failed constraints return value.
508: * @return true if there was any registered ValueErrorHandlers to notify.
509: */
510: boolean fireValueErrorHandlers(Constraint.FailedCheck error) {
511: if (error.getLevel().intValue() >= errorReportingLevel) {
512: for (Iterator it = valueErrorHandlers.iterator(); it
513: .hasNext();) {
514: ((ValueErrorHandler) it.next()).handleValueError(error);
515: }
516: }
517: return valueErrorHandlers.size() > 0;
518: }
519:
520: /**
521: * Set the level for which notification of failed constraints will be fired.
522: *
523: * @param level the error reporting level.
524: */
525: public void setErrorReportingLevel(Level level) {
526: errorReportingLevel = level.intValue();
527: }
528:
/**
 * Creates and returns a <tt>List</tt> of all files comprising the current
 * settings framework.
 *
 * <p>The List contains the absolute String path of each file.
 *
 * <p>The list should contain any configurable files, including such files
 * as the seed file and any other files used by the various settings
 * modules.
 *
 * <p>Implementations of the SettingsHandler that do not use files for
 * permanent storage should return an empty list.
 *
 * @return <code>List</code> of framework files.
 */
public abstract List getListOfAllFiles();
543:
544: /**
545: * Clear any per-host settings cached in memory; allows editting of
546: * per-host settings files on disk, perhaps in bulk/automated fashion,
547: * to take effect in running crawl.
548: */
549: public void clearPerHostSettingsCache() {
550: settingsCache.clear();
551: }
552: }
|