001: /*
002: * CrawlSettingsSAXHandler
003: *
004: * $Id: CrawlSettingsSAXHandler.java 5111 2007-05-03 01:43:43Z gojomo $
005: *
006: * Created on Dec 8, 2003
007: *
008: * Copyright (C) 2004 Internet Archive.
009: *
010: * This file is part of the Heritrix web crawler (crawler.archive.org).
011: *
012: * Heritrix is free software; you can redistribute it and/or modify it under the
013: * terms of the GNU Lesser Public License as published by the Free Software
014: * Foundation; either version 2.1 of the License, or any later version.
015: *
016: * Heritrix is distributed in the hope that it will be useful, but WITHOUT ANY
017: * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
018: * A PARTICULAR PURPOSE. See the GNU Lesser Public License for more details.
019: *
020: * You should have received a copy of the GNU Lesser Public License along with
021: * Heritrix; if not, write to the Free Software Foundation, Inc., 59 Temple
022: * Place, Suite 330, Boston, MA 02111-1307 USA
023: */
024: package org.archive.crawler.settings;
025:
026: import java.lang.reflect.InvocationTargetException;
027: import java.text.ParseException;
028: import java.util.HashMap;
029: import java.util.Map;
030: import java.util.Stack;
031: import java.util.logging.Level;
032: import java.util.logging.Logger;
033:
034: import javax.management.Attribute;
035: import javax.management.AttributeNotFoundException;
036: import javax.management.InvalidAttributeValueException;
037:
038: import org.archive.crawler.settings.Constraint.FailedCheck;
039: import org.archive.crawler.settings.refinements.PortnumberCriteria;
040: import org.archive.crawler.settings.refinements.Refinement;
041: import org.archive.crawler.settings.refinements.RegularExpressionCriteria;
042: import org.archive.crawler.settings.refinements.TimespanCriteria;
043: import org.archive.util.ArchiveUtils;
044: import org.xml.sax.Attributes;
045: import org.xml.sax.Locator;
046: import org.xml.sax.SAXException;
047: import org.xml.sax.SAXParseException;
048: import org.xml.sax.helpers.DefaultHandler;
049:
050: /**
051: * An SAX element handler that updates a CrawlerSettings object.
052: *
053: * This is a helper class for the XMLSettingsHandler.
054: *
055: * @author John Erik Halse
056: */
057: public class CrawlSettingsSAXHandler extends DefaultHandler implements
058: ValueErrorHandler {
059:
060: private static Logger logger = Logger
061: .getLogger("org.archive.crawler.settings.XMLSettingsHandler");
062:
063: private Locator locator;
064:
065: private CrawlerSettings settings;
066:
067: private SettingsHandler settingsHandler;
068:
069: private Map<String, ElementHandler> handlers = new HashMap<String, ElementHandler>();
070:
071: private Stack<ElementHandler> handlerStack = new Stack<ElementHandler>();
072:
073: private Stack<Object> stack = new Stack<Object>();
074:
075: /** Keeps track of elements which subelements should be skipped. */
076: private Stack<Boolean> skip = new Stack<Boolean>();
077:
078: private StringBuffer buffer = new StringBuffer();
079:
080: private String value;
081:
082: /**
083: * Creates a new CrawlSettingsSAXHandler.
084: *
085: * @param settings the settings object that should be updated from this
086: * handler.
087: */
088: public CrawlSettingsSAXHandler(CrawlerSettings settings) {
089: super ();
090: this .settings = settings;
091: this .settingsHandler = settings.getSettingsHandler();
092: handlers.put(XMLSettingsHandler.XML_ROOT_ORDER,
093: new RootHandler());
094: handlers.put(XMLSettingsHandler.XML_ROOT_HOST_SETTINGS,
095: new RootHandler());
096: handlers.put(XMLSettingsHandler.XML_ROOT_REFINEMENT,
097: new RootHandler());
098: handlers.put(XMLSettingsHandler.XML_ELEMENT_CONTROLLER,
099: new ModuleHandler());
100: handlers.put(XMLSettingsHandler.XML_ELEMENT_OBJECT,
101: new ModuleHandler());
102: handlers.put(XMLSettingsHandler.XML_ELEMENT_NEW_OBJECT,
103: new NewModuleHandler());
104: handlers.put(XMLSettingsHandler.XML_ELEMENT_META,
105: new MetaHandler());
106: handlers.put(XMLSettingsHandler.XML_ELEMENT_NAME,
107: new NameHandler());
108: handlers.put(XMLSettingsHandler.XML_ELEMENT_DESCRIPTION,
109: new DescriptionHandler());
110: handlers.put(XMLSettingsHandler.XML_ELEMENT_OPERATOR,
111: new OperatorHandler());
112: handlers.put(XMLSettingsHandler.XML_ELEMENT_ORGANIZATION,
113: new OrganizationHandler());
114: handlers.put(XMLSettingsHandler.XML_ELEMENT_AUDIENCE,
115: new AudienceHandler());
116: handlers.put(XMLSettingsHandler.XML_ELEMENT_DATE,
117: new DateHandler());
118: handlers.put(SettingsHandler.MAP, new MapHandler());
119: handlers.put(SettingsHandler.INTEGER_LIST, new ListHandler());
120: handlers.put(SettingsHandler.STRING_LIST, new ListHandler());
121: handlers.put(SettingsHandler.DOUBLE_LIST, new ListHandler());
122: handlers.put(SettingsHandler.FLOAT_LIST, new ListHandler());
123: handlers.put(SettingsHandler.LONG_LIST, new ListHandler());
124: handlers
125: .put(SettingsHandler.STRING, new SimpleElementHandler());
126: handlers.put(SettingsHandler.TEXT, new SimpleElementHandler());
127: handlers.put(SettingsHandler.INTEGER,
128: new SimpleElementHandler());
129: handlers.put(SettingsHandler.FLOAT, new SimpleElementHandler());
130: handlers.put(SettingsHandler.LONG, new SimpleElementHandler());
131: handlers.put(SettingsHandler.BOOLEAN,
132: new SimpleElementHandler());
133: handlers
134: .put(SettingsHandler.DOUBLE, new SimpleElementHandler());
135:
136: handlers.put(XMLSettingsHandler.XML_ELEMENT_REFINEMENTLIST,
137: new RefinementListHandler());
138: handlers.put(XMLSettingsHandler.XML_ELEMENT_REFINEMENT,
139: new RefinementHandler());
140: handlers.put(XMLSettingsHandler.XML_ELEMENT_REFERENCE,
141: new ReferenceHandler());
142: handlers.put(XMLSettingsHandler.XML_ELEMENT_LIMITS,
143: new LimitsHandler());
144: handlers.put(XMLSettingsHandler.XML_ELEMENT_TIMESPAN,
145: new TimespanHandler());
146: handlers.put(XMLSettingsHandler.XML_ELEMENT_PORTNUMBER,
147: new PortnumberHandler());
148: handlers.put(XMLSettingsHandler.XML_ELEMENT_URIMATCHES,
149: new URIMatcherHandler());
150: }
151:
152: /*
153: * (non-Javadoc)
154: *
155: * @see org.xml.sax.ContentHandler#setDocumentLocator(org.xml.sax.Locator)
156: */
157: public void setDocumentLocator(Locator locator) {
158: super .setDocumentLocator(locator);
159: this .locator = locator;
160: }
161:
162: /*
163: * (non-Javadoc)
164: *
165: * @see org.xml.sax.ContentHandler#startDocument()
166: */
167: public void startDocument() throws SAXException {
168: settingsHandler.registerValueErrorHandler(this );
169: skip.push(new Boolean(false));
170: super .startDocument();
171: }
172:
173: /*
174: * (non-Javadoc)
175: *
176: * @see org.xml.sax.ContentHandler#endDocument()
177: */
178: public void endDocument() throws SAXException {
179: settingsHandler.unregisterValueErrorHandler(this );
180: super .endDocument();
181: }
182:
183: /*
184: * (non-Javadoc)
185: *
186: * @see org.xml.sax.ContentHandler#characters(char[], int, int)
187: */
188: public void characters(char[] ch, int start, int length)
189: throws SAXException {
190: super .characters(ch, start, length);
191: buffer.append(ch, start, length);
192: }
193:
194: /**
195: * Start of an element. Decide what handler to use, and call it.
196: *
197: * @param uri
198: * @param localName
199: * @param qName
200: * @param attributes
201: * @throws SAXException
202: */
203: public void startElement(String uri, String localName,
204: String qName, Attributes attributes) throws SAXException {
205:
206: ElementHandler handler = ((ElementHandler) handlers.get(qName));
207: if (handler != null) {
208: handlerStack.push(handler);
209:
210: if (((Boolean) skip.peek()).booleanValue()) {
211: skip.push(new Boolean(true));
212: String moduleName = attributes
213: .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME);
214: logger.fine("Skipping: " + qName + " " + moduleName);
215: } else {
216: try {
217: handler.startElement(qName, attributes);
218: skip.push(new Boolean(false));
219: } catch (SAXException e) {
220: if (e.getException() instanceof InvocationTargetException
221: || e.getException() instanceof AttributeNotFoundException) {
222: skip.push(new Boolean(true));
223: } else {
224: skip.push(new Boolean(false));
225: throw e;
226: }
227: }
228: }
229: } else {
230: String tmp = "Unknown element '" + qName + "' in '"
231: + locator.getSystemId() + "', line: "
232: + locator.getLineNumber() + ", column: "
233: + locator.getColumnNumber();
234: if (this .settingsHandler.getOrder() != null
235: && this .settingsHandler.getOrder().getController() != null) {
236: logger.log(Level.WARNING, tmp);
237: }
238: logger.warning(tmp);
239: }
240: }
241:
242: /**
243: * End of an element.
244: *
245: * @param uri
246: * @param localName
247: * @param qName
248: * @throws SAXException
249: */
250: public void endElement(String uri, String localName, String qName)
251: throws SAXException {
252: value = buffer.toString().trim();
253: buffer.setLength(0);
254: ElementHandler handler = (ElementHandler) handlerStack.pop();
255: if (!((Boolean) skip.pop()).booleanValue()) {
256: if (handler != null) {
257: handler.endElement(qName);
258: }
259: }
260: }
261:
262: public void illegalElementError(String name)
263: throws SAXParseException {
264: throw new SAXParseException("Element '" + name
265: + "' not allowed here", locator);
266: }
267:
268: /**
269: * Superclass of all the elementhandlers.
270: *
271: * This class should be subclassed for the different XML-elements.
272: *
273: * @author John Erik Halse
274: */
275: private class ElementHandler {
276:
277: /**
278: * Start of an element
279: *
280: * @param name
281: * @param atts
282: * @throws SAXException
283: */
284: public void startElement(String name, Attributes atts)
285: throws SAXException {
286: }
287:
288: /**
289: * End of an element
290: *
291: * @param name
292: * @throws SAXException
293: */
294: public void endElement(String name) throws SAXException {
295: }
296: }
297:
298: /**
299: * Handle the root element.
300: *
301: * This class checks that the root element is of the right type.
302: *
303: * @author John Erik Halse
304: */
305: private class RootHandler extends ElementHandler {
306:
307: public void startElement(String name, Attributes atts)
308: throws SAXException {
309: // Check filetype
310: if ((name.equals(XMLSettingsHandler.XML_ROOT_ORDER) && settings
311: .getScope() != null)
312: || (name
313: .equals(XMLSettingsHandler.XML_ROOT_HOST_SETTINGS) && settings
314: .getScope() == null)
315: || (name
316: .equals(XMLSettingsHandler.XML_ROOT_REFINEMENT) && !settings
317: .isRefinement())) {
318: throw new SAXParseException("Wrong document type '"
319: + name + "'", locator);
320: }
321: }
322: }
323:
324: // Meta handlers
325: private class MetaHandler extends ElementHandler {
326: }
327:
328: private class NameHandler extends ElementHandler {
329:
330: public void endElement(String name) throws SAXException {
331: if (handlerStack.peek() instanceof MetaHandler) {
332: settings.setName(value);
333: } else {
334: illegalElementError(name);
335: }
336: }
337: }
338:
339: private class DescriptionHandler extends ElementHandler {
340:
341: public void endElement(String name) throws SAXException {
342: if (handlerStack.peek() instanceof MetaHandler) {
343: settings.setDescription(value);
344: } else if (handlerStack.peek() instanceof RefinementHandler) {
345: ((Refinement) stack.peek()).setDescription(value);
346: } else {
347: illegalElementError(name);
348: }
349: }
350: }
351:
352: private class OrganizationHandler extends ElementHandler {
353:
354: public void endElement(String name) throws SAXException {
355: if (handlerStack.peek() instanceof MetaHandler) {
356: settings.setOrganization(value);
357: } else if (handlerStack.peek() instanceof RefinementHandler) {
358: ((Refinement) stack.peek()).setOrganization(value);
359: } else {
360: illegalElementError(name);
361: }
362: }
363: }
364:
365: private class OperatorHandler extends ElementHandler {
366:
367: public void endElement(String name) throws SAXException {
368: if (handlerStack.peek() instanceof MetaHandler) {
369: settings.setOperator(value);
370: } else if (handlerStack.peek() instanceof RefinementHandler) {
371: ((Refinement) stack.peek()).setOperator(value);
372: } else {
373: illegalElementError(name);
374: }
375: }
376: }
377:
378: private class AudienceHandler extends ElementHandler {
379:
380: public void endElement(String name) throws SAXException {
381: if (handlerStack.peek() instanceof MetaHandler) {
382: settings.setAudience(value);
383: } else if (handlerStack.peek() instanceof RefinementHandler) {
384: ((Refinement) stack.peek()).setAudience(value);
385: } else {
386: illegalElementError(name);
387: }
388: }
389: }
390:
391: private class DateHandler extends ElementHandler {
392:
393: public void endElement(String name) throws SAXException {
394: if (handlerStack.peek() instanceof MetaHandler) {
395: try {
396: settings.setLastSavedTime(ArchiveUtils
397: .parse14DigitDate(value));
398: } catch (ParseException e) {
399: throw new SAXException(e);
400: }
401: } else {
402: illegalElementError(name);
403: }
404: }
405: }
406:
407: // Refinement handlers
408: private class RefinementListHandler extends ElementHandler {
409:
410: public void startElement(String name) throws SAXException {
411: if (!(handlerStack.peek() instanceof RootHandler)) {
412: illegalElementError(name);
413: }
414: }
415: }
416:
417: private class RefinementHandler extends ElementHandler {
418: public void startElement(String name, Attributes atts)
419: throws SAXException {
420: stack
421: .push(new Refinement(
422: settings,
423: atts
424: .getValue(XMLSettingsHandler.XML_ELEMENT_REFERENCE)));
425: }
426: }
427:
428: private class ReferenceHandler extends ElementHandler {
429:
430: public void endElement(String name) throws SAXException {
431: if (handlerStack.peek() instanceof RefinementHandler) {
432: ((Refinement) stack.peek()).setReference(value);
433: } else {
434: illegalElementError(name);
435: }
436: }
437: }
438:
439: private class LimitsHandler extends ElementHandler {
440: }
441:
442: private class TimespanHandler extends ElementHandler {
443:
444: public void startElement(String name, Attributes atts)
445: throws SAXException {
446: if (stack.peek() instanceof Refinement) {
447: String from = atts
448: .getValue(XMLSettingsHandler.XML_ATTRIBUTE_FROM);
449: String to = atts
450: .getValue(XMLSettingsHandler.XML_ATTRIBUTE_TO);
451: try {
452: TimespanCriteria timespan = new TimespanCriteria(
453: from, to);
454: ((Refinement) stack.peek()).addCriteria(timespan);
455: } catch (ParseException e) {
456: throw new SAXException(e);
457: }
458: } else {
459: illegalElementError(name);
460: }
461: }
462: }
463:
464: private class PortnumberHandler extends ElementHandler {
465:
466: public void endElement(String name) throws SAXException {
467: if (handlerStack.peek() instanceof LimitsHandler) {
468: ((Refinement) stack.peek())
469: .addCriteria(new PortnumberCriteria(value));
470: } else {
471: illegalElementError(name);
472: }
473: }
474: }
475:
476: private class URIMatcherHandler extends ElementHandler {
477:
478: public void endElement(String name) throws SAXException {
479: if (handlerStack.peek() instanceof LimitsHandler) {
480: ((Refinement) stack.peek())
481: .addCriteria(new RegularExpressionCriteria(
482: value));
483: } else {
484: illegalElementError(name);
485: }
486: }
487: }
488:
489: // Handlers for objects and attributes
490: private class ModuleHandler extends ElementHandler {
491:
492: public void startElement(String name, Attributes atts)
493: throws SAXException {
494: ModuleType module;
495: if (name.equals(XMLSettingsHandler.XML_ELEMENT_CONTROLLER)) {
496: module = settingsHandler.getOrder();
497: } else {
498: module = settingsHandler
499: .getSettingsObject(null)
500: .getModule(
501: atts
502: .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME));
503: }
504: stack.push(module);
505: }
506:
507: public void endElement(String name) throws SAXException {
508: stack.pop();
509: }
510: }
511:
512: private class NewModuleHandler extends ElementHandler {
513:
514: public void startElement(String name, Attributes atts)
515: throws SAXException {
516: ComplexType parentModule = (ComplexType) stack.peek();
517: String moduleName = atts
518: .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME);
519: String moduleClass = atts
520: .getValue(XMLSettingsHandler.XML_ATTRIBUTE_CLASS);
521: try {
522: ModuleType module = SettingsHandler
523: .instantiateModuleTypeFromClassName(moduleName,
524: moduleClass);
525: try {
526: parentModule.setAttribute(settings, module);
527: } catch (AttributeNotFoundException e) {
528: // Attribute was not found, but the complex type might
529: // be a MapType and then we are allowed to add new
530: // elements.
531: try {
532: parentModule.addElement(settings, module);
533: } catch (IllegalStateException ise) {
534: // An attribute in the settings file is not in the
535: // ComplexType's definition, log and skip.
536: logger.log(Level.WARNING, "Module '"
537: + moduleName + "' in '"
538: + locator.getSystemId() + "', line: "
539: + locator.getLineNumber()
540: + ", column: "
541: + locator.getColumnNumber()
542: + " is not defined in '"
543: + parentModule.getName() + "'.");
544: throw new SAXException(
545: new AttributeNotFoundException(ise
546: .getMessage()));
547: }
548: }
549: stack.push(module);
550: } catch (InvocationTargetException e) {
551: logger.log(Level.WARNING, "Couldn't instantiate "
552: + moduleName + ", from class: " + moduleClass
553: + "' in '" + locator.getSystemId()
554: + "', line: " + locator.getLineNumber()
555: + ", column: " + locator.getColumnNumber(), e);
556: throw new SAXException(e);
557: } catch (InvalidAttributeValueException e) {
558: throw new SAXException(e);
559: }
560: }
561:
562: public void endElement(String name) throws SAXException {
563: stack.pop();
564: }
565: }
566:
567: private class MapHandler extends ElementHandler {
568:
569: public void startElement(String name, Attributes atts)
570: throws SAXException {
571: String mapName = atts
572: .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME);
573: ComplexType parentModule = (ComplexType) stack.peek();
574: try {
575: stack
576: .push(parentModule.getAttribute(settings,
577: mapName));
578: } catch (AttributeNotFoundException e) {
579: throw new SAXException(e);
580: }
581: }
582:
583: public void endElement(String name) throws SAXException {
584: stack.pop();
585: }
586: }
587:
588: private class SimpleElementHandler extends ElementHandler {
589:
590: public void startElement(String name, Attributes atts)
591: throws SAXException {
592: stack.push(atts
593: .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME));
594: }
595:
596: public void endElement(String name) throws SAXException {
597: String elementName = (String) stack.pop();
598: Object container = stack.peek();
599: if (container instanceof ComplexType) {
600: try {
601: try {
602: ((ComplexType) container).setAttribute(
603: settings, new Attribute(elementName,
604: value));
605: } catch (AttributeNotFoundException e) {
606: // Attribute was not found, but the complex type might
607: // be a MapType and then we are allowed to add new
608: // elements.
609: try {
610: ((ComplexType) container).addElement(
611: settings, new SimpleType(
612: elementName, "", value));
613: } catch (IllegalStateException ise) {
614: logger.warning("Unknown attribute '"
615: + elementName + "' in '"
616: + locator.getSystemId()
617: + "', line: "
618: + locator.getLineNumber()
619: + ", column: "
620: + locator.getColumnNumber());
621: }
622: }
623: } catch (InvalidAttributeValueException e) {
624: try {
625: logger.warning("Illegal value '"
626: + value
627: + "' for attribute '"
628: + elementName
629: + "' in '"
630: + locator.getSystemId()
631: + "', line: "
632: + locator.getLineNumber()
633: + ", column: "
634: + locator.getColumnNumber()
635: + ", Value reset to default value: "
636: + ((ComplexType) container)
637: .getAttribute(settings,
638: elementName));
639: } catch (AttributeNotFoundException e1) {
640: throw new SAXException(e1);
641: }
642: }
643: } else {
644: if (container == null) {
645: // We can get here if an override is referring to a global
646: // filter since removed. Log it as severe; operator will
647: // probably want to know of all overrides with references
648: // to a global filter since removed.
649: logger
650: .severe("Empty container (Was a referenced parent"
651: + " filter removed?). Element details: elementName "
652: + elementName + ", name " + name);
653: } else {
654: ((ListType) container).add(value);
655: }
656: }
657: }
658: }
659:
660: private class ListHandler extends ElementHandler {
661:
662: public void startElement(String name, Attributes atts)
663: throws SAXException {
664: String listName = atts
665: .getValue(XMLSettingsHandler.XML_ATTRIBUTE_NAME);
666: ComplexType parentModule = (ComplexType) stack.peek();
667: ListType list;
668: try {
669: list = (ListType) parentModule.getAttribute(settings,
670: listName);
671: } catch (AttributeNotFoundException e) {
672: throw new SAXException(e);
673: }
674: list.clear();
675: stack.push(list);
676: }
677:
678: public void endElement(String name) throws SAXException {
679: stack.pop();
680: }
681: }
682:
683: /*
684: * (non-Javadoc)
685: *
686: * @see org.archive.crawler.settings.ValueErrorHandler#handleValueError(org.archive.crawler.settings.Constraint.FailedCheck)
687: */
688: public void handleValueError(FailedCheck error) {
689: logger.warning(error.getMessage() + "\n Attribute: '"
690: + error.getOwner().getName() + ":"
691: + error.getDefinition().getName() + "'\n Value: '"
692: + value + "'\n File: '" + locator.getSystemId()
693: + "', line: " + locator.getLineNumber() + ", column: "
694: + locator.getColumnNumber());
695: }
696: }
|