001: /* XMLSettingsHandler
002: *
003: * $Id: XMLSettingsHandler.java 4662 2006-09-25 23:45:21Z paul_jack $
004: *
005: * Created on Dec 18, 2003
006: *
007: * Copyright (C) 2004 Internet Archive.
008: *
009: * This file is part of the Heritrix web crawler (crawler.archive.org).
010: *
011: * Heritrix is free software; you can redistribute it and/or modify
012: * it under the terms of the GNU Lesser Public License as published by
013: * the Free Software Foundation; either version 2.1 of the License, or
014: * any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019: * GNU Lesser Public License for more details.
020: *
021: * You should have received a copy of the GNU Lesser Public License
022: * along with Heritrix; if not, write to the Free Software
023: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024: */
025: package org.archive.crawler.settings;
026:
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.management.Attribute;
import javax.management.AttributeNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanAttributeInfo;
import javax.management.MBeanException;
import javax.management.MBeanInfo;
import javax.management.ReflectionException;
import javax.xml.parsers.FactoryConfigurationError;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;

import org.archive.crawler.datamodel.CrawlOrder;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
062:
063: /** A SettingsHandler which uses XML files as persistent storage.
064: *
065: * @author John Erik Halse
066: */
067: public class XMLSettingsHandler extends SettingsHandler {
068: private static Logger logger = Logger
069: .getLogger("org.archive.crawler.settings.XMLSettingsHandler");
070:
071: // XML element name constants
072: protected static final String XML_SCHEMA = "heritrix_settings.xsd";
073: protected static final String XML_ROOT_ORDER = "crawl-order";
074: protected static final String XML_ROOT_HOST_SETTINGS = "crawl-settings";
075: protected static final String XML_ROOT_REFINEMENT = "crawl-refinement";
076: protected static final String XML_ELEMENT_CONTROLLER = "controller";
077: protected static final String XML_ELEMENT_META = "meta";
078: protected static final String XML_ELEMENT_NAME = "name";
079: protected static final String XML_ELEMENT_DESCRIPTION = "description";
080: protected static final String XML_ELEMENT_OPERATOR = "operator";
081: protected static final String XML_ELEMENT_ORGANIZATION = "organization";
082: protected static final String XML_ELEMENT_AUDIENCE = "audience";
083: protected static final String XML_ELEMENT_DATE = "date";
084: protected static final String XML_ELEMENT_REFINEMENTLIST = "refinement-list";
085: protected static final String XML_ELEMENT_REFINEMENT = "refinement";
086: protected static final String XML_ELEMENT_REFERENCE = "reference";
087: protected static final String XML_ELEMENT_LIMITS = "limits";
088: protected static final String XML_ELEMENT_TIMESPAN = "timespan";
089: protected static final String XML_ELEMENT_PORTNUMBER = "portnumber";
090: protected static final String XML_ELEMENT_URIMATCHES = "uri-matches";
091: protected static final String XML_ELEMENT_CONTENTMATCHES = "content-type-matches";
092: protected static final String XML_ELEMENT_OBJECT = "object";
093: protected static final String XML_ELEMENT_NEW_OBJECT = "newObject";
094: protected static final String XML_ATTRIBUTE_NAME = "name";
095: protected static final String XML_ATTRIBUTE_CLASS = "class";
096: protected static final String XML_ATTRIBUTE_FROM = "from";
097: protected static final String XML_ATTRIBUTE_TO = "to";
098:
099: private File orderFile;
100: private final static String settingsFilename = "settings";
101: private final static String settingsFilenameSuffix = "xml";
102: private final static String REFINEMENT_DIR = "_refinements";
103:
104: /** Create a new XMLSettingsHandler object.
105: *
106: * @param orderFile where the order file is located.
107: * @throws InvalidAttributeValueException
108: */
109: public XMLSettingsHandler(File orderFile)
110: throws InvalidAttributeValueException {
111: super ();
112: this .orderFile = orderFile.getAbsoluteFile();
113: }
114:
115: /** Initialize the SettingsHandler.
116: *
117: * This method builds the settings data structure and initializes it with
118: * settings from the order file given to the constructor.
119: */
120: public void initialize() {
121: super .initialize();
122: }
123:
124: /**
125: * Initialize the SettingsHandler from a source.
126: *
127: * This method builds the settings data structure and initializes it with
128: * settings from the order file given as a parameter. The intended use is
129: * to create a new order file based on a default (template) order file.
130: *
131: * @param source the order file to initialize from.
132: */
133: public void initialize(File source) {
134: File tmpOrderFile = orderFile;
135: orderFile = source.getAbsoluteFile();
136: this .initialize();
137: orderFile = tmpOrderFile;
138: }
139:
140: private File getSettingsDirectory() {
141: String settingsDirectoryName = null;
142: try {
143: settingsDirectoryName = (String) getOrder().getAttribute(
144: CrawlOrder.ATTR_SETTINGS_DIRECTORY);
145: } catch (AttributeNotFoundException e) {
146: e.printStackTrace();
147: } catch (MBeanException e) {
148: e.printStackTrace();
149: } catch (ReflectionException e) {
150: e.printStackTrace();
151: }
152:
153: return getPathRelativeToWorkingDirectory(settingsDirectoryName);
154: }
155:
156: /** Resolves the filename for a settings object into a file path.
157: *
158: * It will also create the directory structure leading to this file
159: * if it doesn't exist.
160: *
161: * @param settings the settings object to get file path for.
162: * @return the file path for this settings object.
163: */
164: protected final File settingsToFilename(CrawlerSettings settings) {
165: File file;
166:
167: if (settings.getScope() == null
168: || settings.getScope().equals("")) {
169: if (settings.isRefinement()) {
170: file = new File(getSettingsDirectory(),
171: File.separatorChar + REFINEMENT_DIR
172: + File.separatorChar
173: + settings.getName() + '.'
174: + settingsFilenameSuffix);
175: } else {
176: file = orderFile;
177: }
178: } else {
179: String elements[] = settings.getScope().split("\\.");
180: if (elements.length == 0) {
181: return orderFile;
182: }
183:
184: StringBuffer path = new StringBuffer();
185: for (int i = elements.length - 1; i > 0; i--) {
186: path.append(elements[i]);
187: path.append(File.separatorChar);
188: }
189: path.append(elements[0]);
190:
191: if (settings.isRefinement()) {
192: file = new File(getSettingsDirectory(), path.toString()
193: + File.separatorChar + REFINEMENT_DIR
194: + File.separatorChar + settings.getName() + '.'
195: + settingsFilenameSuffix);
196: } else {
197: file = new File(getSettingsDirectory(), path.toString()
198: + File.separatorChar + settingsFilename + "."
199: + settingsFilenameSuffix);
200: }
201: }
202: return file;
203: }
204:
205: public final void writeSettingsObject(CrawlerSettings settings) {
206: File filename = settingsToFilename(settings);
207: writeSettingsObject(settings, filename);
208: }
209:
210: /** Write a CrawlerSettings object to a specified file.
211: *
212: * This method is similar to {@link #writeSettingsObject(CrawlerSettings)}
213: * except that it uses the submitted File object instead of trying to
214: * resolve where the file should be written.
215: *
216: * @param settings the settings object to be serialized.
217: * @param filename the file to which the settings object should be written.
218: */
219: public final void writeSettingsObject(CrawlerSettings settings,
220: File filename) {
221:
222: logger.fine("Writing " + filename.getAbsolutePath());
223: filename.getParentFile().mkdirs();
224:
225: try {
226: long lastSaved = 0L;
227: File backup = null;
228: if (getOrder().getController() != null && filename.exists()) {
229: // The crawler is running and file exists - make backup first.
230: String name = filename.getName();
231: lastSaved = settings.getLastSavedTime().getTime();
232: name = name.substring(0, name.lastIndexOf('.')) + '_'
233: + ArchiveUtils.get14DigitDate(lastSaved) + "."
234: + settingsFilenameSuffix;
235: backup = new File(filename.getParentFile(), name);
236: FileUtils.copyFiles(filename, backup);
237: }
238:
239: StreamResult result = new StreamResult(
240: new BufferedOutputStream(new FileOutputStream(
241: filename)));
242: Transformer transformer = TransformerFactory.newInstance()
243: .newTransformer();
244: Source source = new CrawlSettingsSAXSource(settings);
245: transformer.transform(source, result);
246:
247: // Hack to get rid of unnesessary backupfiles.
248: // What happens is that the WUI often saves settings files
249: // several times during a settings change. This code removes the
250: // last backup file if its no more than 2 minutes old.
251: if (lastSaved > (System.currentTimeMillis() - 2 * 60 * 1000)) {
252: backup.delete();
253: }
254: } catch (Exception e) {
255: e.printStackTrace();
256: }
257: }
258:
259: /** Read the CrawlerSettings object from a specific file.
260: *
261: * @param settings the settings object to be updated with data from the
262: * persistent storage.
263: * @param f the file to read from.
264: * @return the updated settings object or null if there was no data for this
265: * in the persistent storage.
266: */
267: protected final CrawlerSettings readSettingsObject(
268: CrawlerSettings settings, File f) {
269: CrawlerSettings result = null;
270: try {
271: InputStream is = null;
272: if (!f.exists()) {
273: // Perhaps the file we're looking for is on the CLASSPATH.
274: // DON'T look on the CLASSPATH for 'settings.xml' files. The
275: // look for 'settings.xml' files happens frequently. Not looking
276: // on classpath for 'settings.xml' is an optimization based on
277: // ASSUMPTION that there will never be a 'settings.xml' saved
278: // on classpath.
279: if (!f.getName().startsWith(settingsFilename)) {
280: is = XMLSettingsHandler.class.getResourceAsStream(f
281: .getPath());
282: }
283: } else {
284: is = new FileInputStream(f);
285: }
286: if (is != null) {
287: XMLReader parser = SAXParserFactory.newInstance()
288: .newSAXParser().getXMLReader();
289: InputStream file = new BufferedInputStream(is);
290: parser.setContentHandler(new CrawlSettingsSAXHandler(
291: settings));
292: InputSource source = new InputSource(file);
293: source.setSystemId(f.toURL().toExternalForm());
294: parser.parse(source);
295: result = settings;
296: }
297: } catch (SAXParseException e) {
298: logger.warning(e.getMessage() + " in '" + e.getSystemId()
299: + "', line: " + e.getLineNumber() + ", column: "
300: + e.getColumnNumber());
301: } catch (SAXException e) {
302: logger.warning(e.getMessage() + ": "
303: + e.getException().getMessage());
304: } catch (ParserConfigurationException e) {
305: logger.warning(e.getMessage() + ": "
306: + e.getCause().getMessage());
307: } catch (FactoryConfigurationError e) {
308: logger.warning(e.getMessage() + ": "
309: + e.getException().getMessage());
310: } catch (IOException e) {
311: logger.warning("Could not access file '"
312: + f.getAbsolutePath() + "': " + e.getMessage());
313: }
314: return result;
315: }
316:
317: protected final CrawlerSettings readSettingsObject(
318: CrawlerSettings settings) {
319: File filename = settingsToFilename(settings);
320: return readSettingsObject(settings, filename);
321: }
322:
323: /** Get the <code>File</code> object pointing to the order file.
324: *
325: * @return File object for the order file.
326: */
327: public File getOrderFile() {
328: return orderFile;
329: }
330:
331: /** Creates a replica of the settings file structure in another directory
332: * (fully recursive, includes all per host settings). The SettingsHandler
333: * will then refer to the new files.
334: *
335: * Observe that this method should only be called after the SettingsHandler
336: * has been initialized.
337: *
338: * @param newOrderFileName where the new order file should be saved.
339: * @param newSettingsDirectory the top level directory of the per host/domain
340: * settings files.
341: * @throws IOException
342: */
343: public void copySettings(File newOrderFileName,
344: String newSettingsDirectory) throws IOException {
345: File oldSettingsDirectory = getSettingsDirectory();
346:
347: // Write new orderfile and point the settingshandler to it
348: orderFile = newOrderFileName;
349: try {
350: getOrder().setAttribute(
351: new Attribute(CrawlOrder.ATTR_SETTINGS_DIRECTORY,
352: newSettingsDirectory));
353: } catch (Exception e) {
354: throw new IOException(
355: "Could not update settings with new location: "
356: + e.getMessage());
357: }
358: writeSettingsObject(getSettingsObject(null));
359:
360: File newDir = getPathRelativeToWorkingDirectory(newSettingsDirectory);
361:
362: // Copy the per host files if src and dest directories are different.
363: if (oldSettingsDirectory.compareTo(newDir) != 0) {
364: FileUtils.copyFiles(oldSettingsDirectory, newDir);
365: }
366: }
367:
368: /**
369: * Transforms a relative path so that it is relative to the location of the
370: * order file. If an absolute path is given, it will be returned unchanged.<p>
371: * The location of it's order file is always considered as the 'working'
372: * directory for any given settings.
373: * @param path A relative path to a file (or directory)
374: * @return The same path modified so that it is relative to the file level
375: * location of the order file for the settings handler.
376: */
377: public File getPathRelativeToWorkingDirectory(String path) {
378: File f = new File(path);
379: // If path is not absolute, set f's directory
380: // relative to the path of the order file
381: if (!f.isAbsolute()) {
382: f = new File(this .getOrderFile().getParent(), path);
383: }
384: return f;
385: }
386:
387: public Collection getDomainOverrides(String rootDomain) {
388: File settingsDir = getSettingsDirectory();
389:
390: //Find the right start directory.
391: ArrayList<String> domains = new ArrayList<String>();
392: //First we deconstruct the rootDomain string
393: while (rootDomain != null && rootDomain.length() > 0) {
394: if (rootDomain.indexOf('.') < 0) {
395: // Last level.
396: domains.add(rootDomain);
397: break; //We're done.
398: } else {
399: // Got more then one level left.
400: domains.add(rootDomain.substring(0, rootDomain
401: .indexOf('.')));
402: // Strip down rootDomain.
403: rootDomain = rootDomain.substring(rootDomain
404: .indexOf('.') + 1);
405: }
406: }
407: //Build up a proper path
408: //Since the domains are right to left, we start at the end of the array.
409: StringBuffer subDir = new StringBuffer();
410: for (int i = (domains.size() - 1); i >= 0; i--) {
411: subDir.append(File.separator + domains.get(i));
412: }
413: //Then we move to the approprite directory.
414: settingsDir = new File(settingsDir.getPath() + subDir);
415: TreeSet<String> confirmedSubDomains = new TreeSet<String>();
416: if (settingsDir.exists()) {
417: // Found our place! Search through it's subdirs.
418: File[] possibleSubDomains = settingsDir.listFiles();
419: for (int i = 0; i < possibleSubDomains.length; i++) {
420: if (possibleSubDomains[i].isDirectory()
421: && isOverride(possibleSubDomains[i])) {
422: // Found one!
423: confirmedSubDomains.add(possibleSubDomains[i]
424: .getName());
425: }
426: }
427: }
428: return confirmedSubDomains;
429: }
430:
431: /**
432: * Checks if a file is a a 'per host' override or if it's a directory if it
433: * or it's subdirectories contains a 'per host' override file.
434: * @param f The file or directory to check
435: * @return True if the file is an override or it's a directory that contains
436: * such a file.
437: */
438: private boolean isOverride(File f) {
439: if (f.isDirectory()) {
440: // Have a directory, check it's contents.
441: File[] subs = f.listFiles();
442: for (int i = 0; i < subs.length; i++) {
443: if (isOverride(subs[i])) {
444: // Found one. Can stop looking.
445: return true;
446: }
447: }
448: } else if (f.getName().equals(
449: settingsFilename + "." + settingsFilenameSuffix)) {
450: // This is an override file (or sure looks like one in any case).
451: return true;
452: }
453: // Didn't find an override.
454: return false;
455: }
456:
457: /** Delete a settings object from persistent storage.
458: *
459: * Deletes the file represented by the submitted settings object. All empty
460: * directories that are parents to the files path are also deleted.
461: *
462: * @param settings the settings object to delete.
463: */
464: public void deleteSettingsObject(CrawlerSettings settings) {
465: super .deleteSettingsObject(settings);
466: File settingsDirectory = getSettingsDirectory();
467: File settingsFile = settingsToFilename(settings);
468:
469: settingsFile.delete();
470: settingsFile = settingsFile.getParentFile();
471: while (settingsFile.isDirectory()
472: && settingsFile.list().length == 0
473: && !settingsFile.equals(settingsDirectory)) {
474: settingsFile.delete();
475: settingsFile = settingsFile.getParentFile();
476: }
477: }
478:
479: /* (non-Javadoc)
480: * @see org.archive.crawler.settings.SettingsHandler#getListOfAllFiles()
481: */
482: public List<String> getListOfAllFiles() {
483: ArrayList<String> list = new ArrayList<String>();
484: // Add CrawlOrder.
485: list.add(getOrderFile().getAbsolutePath());
486: // Iterate through the entire override hierarchy
487: if (getSettingsDirectory().exists()) {
488: recursiveFindFiles(getSettingsDirectory(), list);
489: }
490: // Get files used by settings modules.
491: recursiveFindSecondaryFiles(getOrder(), list);
492: return list;
493: }
494:
495: /**
496: * Add any files being used by any of the Modules making up the settings to
497: * the list.
498: *
499: * @param mbean A ModuleType to interrogate for files. Any child modules
500: * will be recursively interrogated.
501: * @param list The list to add found files to.
502: */
503: private void recursiveFindSecondaryFiles(ComplexType mbean,
504: ArrayList<String> list) {
505: MBeanInfo info = mbean.getMBeanInfo();
506: MBeanAttributeInfo[] a = info.getAttributes();
507: // Interrogate the current module
508: if (mbean instanceof ModuleType) {
509: ((ModuleType) mbean).listUsedFiles(list);
510: }
511:
512: // Recursively interrogate all sub modules that are of ModuleType
513: for (int n = 0; n < a.length; n++) {
514: if (a[n] == null) {
515: // Error null attribute.
516: } else {
517: ModuleAttributeInfo att = (ModuleAttributeInfo) a[n];
518: Object currentAttribute;
519: try {
520: currentAttribute = mbean
521: .getAttribute(att.getName());
522: if (currentAttribute instanceof ComplexType) {
523: recursiveFindSecondaryFiles(
524: (ComplexType) currentAttribute, list);
525: }
526: } catch (AttributeNotFoundException e) {
527: // TODO Auto-generated catch block
528: e.printStackTrace();
529: } catch (MBeanException e) {
530: // TODO Auto-generated catch block
531: e.printStackTrace();
532: } catch (ReflectionException e) {
533: // TODO Auto-generated catch block
534: e.printStackTrace();
535: }
536: }
537: }
538: }
539:
540: /**
541: * Starting at the specific directory this method will iterate through all
542: * sub directories and add each file (as absolute name, with path as a
543: * string) to the provided ArrayList. Any file found under the settings
544: * directory with the proper suffix will be considered valid and added to
545: * the list.
546: * @param dir Starting directory
547: * @param list The list to add to
548: */
549: private void recursiveFindFiles(File dir, ArrayList<String> list) {
550: File[] subs = dir.listFiles();
551: if (subs != null) {
552: for (int i = 0; i < subs.length; i++) {
553: if (subs[i].isDirectory()) {
554: recursiveFindFiles(subs[i], list);
555: } else {
556: if (subs[i].getName().endsWith(
557: settingsFilenameSuffix)) {
558: // Add it to list
559: list.add(subs[i].getAbsolutePath());
560: }
561: }
562: }
563: }
564: }
565: }
|