001: package it.unimi.dsi.mg4j.document;
002:
003: /*
004: * MG4J: Managing Gigabytes for Java
005: *
006: * Copyright (C) 2005-2007 Paolo Boldi
007: *
008: * This library is free software; you can redistribute it and/or modify it
009: * under the terms of the GNU Lesser General Public License as published by the Free
010: * Software Foundation; either version 2.1 of the License, or (at your option)
011: * any later version.
012: *
013: * This library is distributed in the hope that it will be useful, but
014: * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
015: * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
016: * for more details.
017: *
018: * You should have received a copy of the GNU Lesser General Public License
019: * along with this program; if not, write to the Free Software
020: * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
021: *
022: */
023: import it.unimi.dsi.fastutil.io.BinIO;
024: import it.unimi.dsi.fastutil.objects.Object2IntMap;
025: import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
026: import it.unimi.dsi.fastutil.objects.Object2ObjectLinkedOpenHashMap;
027: import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
028: import it.unimi.dsi.io.FastBufferedReader;
029: import it.unimi.dsi.io.NullReader;
030: import it.unimi.dsi.io.WordReader;
031: import it.unimi.dsi.util.Properties;
032:
033: import java.io.IOException;
034: import java.io.InputStream;
035: import java.io.Serializable;
036: import java.util.Iterator;
037: import java.util.Map;
038:
039: import org.apache.commons.configuration.ConfigurationException;
040:
041: /** A document factory that actually dispatches the task of building documents to various factories
042: * according to some strategy.
043: *
044: * <p>The strategy is specified as (an object embedding) a method that determines which factory
045: * should be used on the basis of the metadata that are provided to the {@link #getDocument(InputStream, Reference2ObjectMap)}
046: * method. Since usually the strategy will have to resolve the name of metadata, it is also passed
047: * this factory, so that the correct
048: * {@link it.unimi.dsi.mg4j.document.PropertyBasedDocumentFactory#resolve(Enum,Reference2ObjectMap)} method can be invoked.
049: *
050: * <p>Moreover, at construction one must specify, for each subfactory and for each field of this
051: * factory, which field of the subfactory should be used. Note that to guarantee sequential access,
052: * fields specified for each subfactory should appear in increasing order.
053: */
054: public class DispatchingDocumentFactory extends
055: PropertyBasedDocumentFactory {
056: private static final long serialVersionUID = 1L;
057:
058: private static final boolean DEBUG = false;
059:
060: /** Case-insensitive keys for metadata.
061: *
062: * @see PropertyBasedDocumentFactory.MetadataKeys
063: */
064: public static enum MetadataKeys {
065: /** The property containing the (comma-separated) sequence of field names. */
066: FIELDNAME,
067: /** The property containing the key that should be checked (e.g., mimetype). */
068: KEY,
069: /** The property containing comma-separated sequence of colon-separated pairs value/document factory names. */
070: RULE,
071: /** The property containing a comma-separated list with as many items as there are factories; each item will be
072: * a colon-separated list of as many integers as there are fields. The <var>k</var>-th integer in the <var>f</var>-th
073: * list is the number of the field of the <var>f</var>-th factory that should be used to extract field number <var>k</var>,
074: * or -1 if the field should be empty. */
075: MAP
076: }
077:
078: /** The value to be used in <code>RULE</code> to introduce the default factory. Otherwise, no default factory is
079: * provided for documents that do not match. */
080: public final static String OTHERWISE_IN_RULE = "?";
081:
082: /** A strategy that decides which factory is appropriate using the document metadata. */
083:
084: public static interface DispatchingStrategy extends Serializable {
085: /** Decides the index of the factory to be used for the given metadata, possibly using
086: * a factory to resolve property names.
087: *
088: * @param metadata the metadata of the document to be produced.
089: * @param factory the factory used to resolve metadata names.
090: * @return the factory index.
091: */
092: public int factoryNumber(
093: Reference2ObjectMap<Enum<?>, Object> metadata,
094: PropertyBasedDocumentFactory factory);
095: };
096:
097: /** A strategy that is based on trying to match the value of the metadata with a given key with respect to a
098: * certain set of values.
099: */
100: public static class StringBasedDispatchingStrategy implements
101: DispatchingStrategy {
102: private static final long serialVersionUID = 1L;
103: /** The key to be resolved. */
104: private final Enum<?> key;
105: /** The values that should be used for comparisons. */
106: private final Object2IntMap<String> value;
107:
108: /** The strategy works as follows: the property named <code>key</code> is resolved; if this property
109: * is not set, the default return value of <var>value</var> is returned.
110: * Otherwise, its value is compared, using the <code>equals</code>,
111: * method with the elements of the <code>value</code> set, and the corresponding integer is returned.
112: *
113: * @param key the key to be resolved.
114: * @param value the map of values.
115: */
116: public StringBasedDispatchingStrategy(final Enum<?> key,
117: final Object2IntMap<String> value) {
118: this .key = key;
119: this .value = value;
120: }
121:
122: public int factoryNumber(
123: final Reference2ObjectMap<Enum<?>, Object> metadata,
124: final PropertyBasedDocumentFactory factory) {
125: final Object val = factory.resolve(key, metadata);
126: if (DEBUG)
127: System.out.println("key " + key + " resolved using "
128: + metadata + " into " + val);
129: return value.getInt(val);
130: }
131:
132: };
133:
134: /** The number of subfactories used. */
135: private int n;
136: /** The subfactories used. */
137: private DocumentFactory[] documentFactory;
138: /** The number of fields of this factory. */
139: private int numberOfFields;
140: /** The names of the fields. */
141: private String[] fieldName;
142: /** The types of the fields. */
143: private FieldType[] fieldType;
144: /** The array specifying how subfactory fields should be mapped into fields of this factory. More precisely,
145: * <code>rename[f][k]</code> specifies which field of factory <code>documentFactory[f]</code> should be used
146: * to return the field named <code>fieldName[k]</code>: it is assumed that the type of the field in the subfactory
147: * is correct (i.e., that <code>documentFactory[f].fieldType(k)==fieldType[k]</code>). The value -1 is used to
148: * return an empty textual field (i.e., a word reader on an empty string).
149: */
150: private int[][] rename;
151: /** The strategy to be used. */
152: private DispatchingStrategy strategy;
153: /** If a {@link StringBasedDispatchingStrategy} should be used, this field represents the property key to be checked.
154: * Otherwise, this is <code>null</code>. */
155: private Enum<?> dispatchingKey;
156: /** If a {@link StringBasedDispatchingStrategy} should be used, this field represents the map from values to factories. */
157: private Object2ObjectLinkedOpenHashMap<String, Class<? extends DocumentFactory>> value2factoryClass;
158:
159: private void init(final DocumentFactory[] documentFactory,
160: final String[] fieldName, final FieldType[] fieldType,
161: final int[][] rename, final DispatchingStrategy strategy) {
162: n = documentFactory.length;
163: this .documentFactory = documentFactory;
164: numberOfFields = fieldName.length;
165: this .fieldName = fieldName;
166: this .fieldType = fieldType;
167: this .rename = rename;
168: this .strategy = strategy;
169: }
170:
171: // TODO: All IllegalArgumentException where ConfigurationException; check that now it's OK
172: private void checkAttributes() {
173: if (fieldName.length != fieldType.length
174: || rename.length != documentFactory.length
175: || documentFactory.length != n
176: || fieldName.length != numberOfFields)
177: throw new IllegalArgumentException(
178: "Length mismatch in defining the dispatching factory");
179: for (int f = 0; f < n; f++) {
180: if (rename[f].length != numberOfFields)
181: throw new IllegalArgumentException(
182: "The number of fields ("
183: + numberOfFields
184: + ") does not match the mapping rule for factory "
185: + documentFactory[f].getClass()
186: .getName());
187: for (int k = 0; k < numberOfFields; k++) {
188: if (rename[f][k] < -1
189: || rename[f][k] >= documentFactory[f]
190: .numberOfFields())
191: throw new IllegalArgumentException(rename[f][k]
192: + " is not a field of factory "
193: + documentFactory[f]);
194: if (rename[f][k] >= 0
195: && fieldType[k] != documentFactory[f]
196: .fieldType(rename[f][k]))
197: throw new IllegalArgumentException(
198: "Field "
199: + rename[f][k]
200: + " of factory "
201: + documentFactory[f]
202: + " has a type different from the type of the field it is mapped to");
203: }
204: }
205: if (n == 0 || numberOfFields == 0)
206: throw new IllegalArgumentException(
207: "Zero factories or fields specified");
208: if (strategy == null)
209: throw new IllegalArgumentException(
210: "No strategy was specified");
211: }
212:
213: private void setExtraArguments(final Object xtraPars)
214: throws IllegalArgumentException {
215: if (value2factoryClass == null)
216: throw new IllegalArgumentException(
217: "No "
218: + MetadataKeys.RULE
219: + " property was specified for the dispatching factory");
220: n = value2factoryClass.values().size();
221: documentFactory = new DocumentFactory[n];
222: Iterator<Class<? extends DocumentFactory>> it = value2factoryClass
223: .values().iterator();
224: for (int f = 0; f < n; f++) {
225: Class<? extends DocumentFactory> documentFactoryClass = it
226: .next();
227: try {
228: if (xtraPars == null)
229: documentFactory[f] = documentFactoryClass
230: .newInstance();
231: else
232: documentFactory[f] = documentFactoryClass
233: .getConstructor(xtraPars.getClass())
234: .newInstance(xtraPars);
235: } catch (Exception e) {
236: throw new IllegalArgumentException(e);
237: }
238: }
239:
240: fieldType = new FieldType[numberOfFields];
241: if (rename == null)
242: throw new IllegalArgumentException(
243: "No "
244: + MetadataKeys.MAP
245: + " property was specified for the dispatching factory");
246: for (int f = 0; f < n; f++) {
247: for (int k = 0; k < numberOfFields; k++) {
248: int kk = rename[f][k];
249: if (kk >= 0
250: && fieldType[k] != null
251: && fieldType[k] != documentFactory[f]
252: .fieldType(kk))
253: throw new IllegalArgumentException(
254: "Mismatch between field types for field "
255: + f
256: + ", relative to the remapping of factory "
257: + documentFactory[f].getClass()
258: .getName()
259: + " (the type used to be "
260: + fieldType[k]
261: + ", but now we want it to be "
262: + documentFactory[f].fieldType(kk)
263: + ")");
264: if (kk >= 0)
265: fieldType[k] = documentFactory[f].fieldType(kk);
266: }
267: }
268: for (int f = 0; f < numberOfFields; f++)
269: if (fieldType[f] == null)
270: throw new IllegalArgumentException(
271: "The type of field "
272: + fieldName[f]
273: + " could not be deduced, because it is never mapped to");
274: if (dispatchingKey == null)
275: throw new IllegalArgumentException(
276: "No "
277: + MetadataKeys.KEY
278: + " property was specified for the dispatching factory");
279: Object2IntMap<String> value2int = new Object2IntOpenHashMap<String>();
280: value2int.defaultReturnValue(-1);
281: for (Map.Entry<String, Class<? extends DocumentFactory>> e : value2factoryClass
282: .entrySet()) {
283: int k;
284: for (k = 0; k < n; k++)
285: if (e.getValue() == documentFactory[k].getClass()) {
286: if (e.getKey().equals(OTHERWISE_IN_RULE))
287: value2int.defaultReturnValue(k);
288: else
289: value2int.put(e.getKey(), k);
290: break;
291: }
292: if (k == n)
293: throw new IllegalArgumentException(
294: "Mismatch in the rule mapping " + e.getKey()
295: + " to " + e.getValue());
296: }
297: System.out.println("Building a strategy mapping "
298: + dispatchingKey + " to " + value2int);
299: strategy = new StringBasedDispatchingStrategy(dispatchingKey,
300: value2int);
301:
302: }
303:
/** Creates a new dispatching factory from an explicit configuration, which is stored and
 * then checked for consistency.
 *
 * @param documentFactory the array of subfactories.
 * @param fieldName the names of this factory's fields.
 * @param fieldType the types of this factory's fields.
 * @param rename the way fields of this class are mapped to fields of the subfactories.
 * @param strategy the strategy to decide which factory should be used.
 */
public DispatchingDocumentFactory(final DocumentFactory[] documentFactory,
        final String[] fieldName, final FieldType[] fieldType, final int[][] rename,
        final DispatchingStrategy strategy) {
    init(documentFactory, fieldName, fieldType, rename, strategy);
    checkAttributes();
}
319:
320: public DispatchingDocumentFactory copy() {
321: final DocumentFactory[] documentFactory = new DocumentFactory[this .documentFactory.length];
322: for (int i = documentFactory.length; i-- != 0;)
323: documentFactory[i] = this .documentFactory[i].copy();
324: return new DispatchingDocumentFactory(documentFactory,
325: fieldName, fieldType, rename, strategy);
326: }
327:
/** Creates a new dispatching factory configured by a set of properties. The same properties
 * are handed over when the subfactories are instantiated.
 *
 * @param properties the properties configuring this factory.
 * @throws ConfigurationException declared by the superclass constructor.
 */
public DispatchingDocumentFactory(final Properties properties) throws ConfigurationException {
    super(properties);
    setExtraArguments(properties);
    checkAttributes();
}
334:
/** Creates a new dispatching factory configured by an array of property specifications.
 * The same array is handed over when the subfactories are instantiated.
 *
 * @param property the property specifications configuring this factory.
 * @throws ConfigurationException declared by the superclass constructor.
 */
public DispatchingDocumentFactory(final String[] property) throws ConfigurationException {
    super(property);
    setExtraArguments(property);
    checkAttributes();
}
341:
/** Creates a new dispatching factory from default metadata only.
 *
 * @param defaultMetadata the default metadata, passed to the superclass constructor.
 */
public DispatchingDocumentFactory(final Reference2ObjectMap<Enum<?>, Object> defaultMetadata) {
    super(defaultMetadata);
    // Will certainly fail because the configuration is actually missing
    checkAttributes();
}
347:
/** Creates a new dispatching factory without any configuration. */
public DispatchingDocumentFactory() {
    super();
    // Will certainly fail because the configuration is actually missing
    checkAttributes();
}
352:
353: @SuppressWarnings("unchecked")
354: @Override
355: protected boolean parseProperty(final String key,
356: final String[] values,
357: final Reference2ObjectMap<Enum<?>, Object> metadata)
358: throws ConfigurationException {
359: if (sameKey(MetadataKeys.FIELDNAME, key)) {
360: fieldName = values;
361: numberOfFields = fieldName.length;
362: return true;
363: } else if (sameKey(MetadataKeys.KEY, key)) {
364: final String dispatchingKeyName = ensureJustOne(key, values);
365: final int lastDot = dispatchingKeyName.lastIndexOf('.');
366: try {
367: dispatchingKey = Enum.valueOf((Class<Enum>) Class
368: .forName(dispatchingKeyName.substring(0,
369: lastDot)), dispatchingKeyName
370: .substring(lastDot + 1));
371: } catch (ClassNotFoundException e) {
372: throw new IllegalArgumentException(
373: "The class specified in the key "
374: + dispatchingKeyName
375: + " cannot be found");
376: }
377: return true;
378: } else if (sameKey(MetadataKeys.RULE, key)) {
379: String[] rules = values;
380: value2factoryClass = new Object2ObjectLinkedOpenHashMap<String, Class<? extends DocumentFactory>>();
381: int i, m = rules.length;
382: for (i = 0; i < m; i++) {
383: int pos = rules[i].indexOf(':');
384: if (pos <= 0 || pos == rules[i].length() - 1)
385: throw new ConfigurationException(
386: "Rule "
387: + rules[i]
388: + " does not contain a colon or it is malformed");
389: if (rules[i].indexOf(':', pos + 1) >= 0)
390: throw new ConfigurationException("Rule " + rules[i]
391: + " contains too many colons");
392: String factoryName = rules[i].substring(pos + 1);
393: Class<? extends DocumentFactory> factoryClass = null;
394: try {
395: factoryClass = (Class<? extends DocumentFactory>) Class
396: .forName(factoryName);
397: if (!(DocumentFactory.class
398: .isAssignableFrom(factoryClass)))
399: throw new ClassNotFoundException();
400: } catch (ClassNotFoundException e) {
401: throw new ConfigurationException(
402: "ParsingFactory "
403: + factoryName
404: + " is invalid; maybe the package name is missing");
405: }
406: value2factoryClass.put(rules[i].substring(0, pos),
407: factoryClass);
408: }
409: m = value2factoryClass.values().size();
410: return true;
411:
412: } else if (sameKey(MetadataKeys.MAP, key)) {
413: String[] pieces = values;
414: int i, m = pieces.length;
415: rename = new int[m][];
416: for (i = 0; i < m; i++) {
417: String[] subpieces = pieces[i].split(":");
418: if (i > 0 && subpieces.length != rename[0].length)
419: throw new ConfigurationException(
420: "Length mismatch in the map " + values);
421: rename[i] = new int[subpieces.length];
422: for (int k = 0; k < subpieces.length; k++) {
423: try {
424: rename[i][k] = Integer.parseInt(subpieces[k]);
425: } catch (NumberFormatException e) {
426: throw new ConfigurationException(
427: "Number format exception in the map "
428: + values);
429: }
430: }
431: }
432: }
433: return super .parseProperty(key, values, metadata);
434: }
435:
436: public int numberOfFields() {
437: return numberOfFields;
438: }
439:
440: public String fieldName(final int field) {
441: ensureFieldIndex(field);
442: return fieldName[field];
443: }
444:
445: public int fieldIndex(final String fieldName) {
446: for (int k = 0; k < numberOfFields; k++)
447: if (this .fieldName[k].equals(fieldName))
448: return k;
449: return -1;
450: }
451:
452: public FieldType fieldType(final int field) {
453: ensureFieldIndex(field);
454: return fieldType[field];
455: }
456:
457: /** A word reader that is returned when a null field should be returned. */
458: final private WordReader nullReader = new FastBufferedReader();
459:
460: public Document getDocument(final InputStream rawContent,
461: final Reference2ObjectMap<Enum<?>, Object> metadata)
462: throws IOException {
463:
464: final int factoryIndex = strategy.factoryNumber(metadata, this );
465: System.out.println("The strategy returned " + factoryIndex);
466: if (factoryIndex < 0 || factoryIndex >= n)
467: throw new IllegalArgumentException();
468:
469: System.out.println("Going to parse a document with " + metadata
470: + ", using "
471: + documentFactory[factoryIndex].getClass().getName());
472:
473: final DocumentFactory factory = documentFactory[factoryIndex];
474: final Document document = factory.getDocument(rawContent,
475: metadata);
476:
477: return new AbstractDocument() {
478: public CharSequence title() {
479: return document.title();
480: }
481:
482: public String toString() {
483: return document.toString();
484: }
485:
486: public CharSequence uri() {
487: return document.uri();
488: }
489:
490: public Object content(final int field) throws IOException {
491: ensureFieldIndex(field);
492: if (rename[factoryIndex][field] < 0)
493: return NullReader.getInstance();
494: return document.content(rename[factoryIndex][field]);
495: }
496:
497: public WordReader wordReader(final int field) {
498: ensureFieldIndex(field);
499: if (rename[factoryIndex][field] < 0)
500: return nullReader;
501: return document.wordReader(rename[factoryIndex][field]);
502: }
503:
504: public void close() throws IOException {
505: super .close();
506: document.close();
507: }
508: };
509:
510: }
511:
512: public static void main(final String[] arg) throws IOException,
513: ConfigurationException {
514: //PdfDocumentFactory pdfFactory = new PdfDocumentFactory();
515: //HtmlDocumentFactory htmlFactory = new HtmlDocumentFactory();
516: //IdentityDocumentFactory idFactory = new IdentityDocumentFactory();
517: //Object2IntMap map = new Object2IntOpenHashMap(
518: // new String[] { "application/pdf", "text/html" },
519: // new int[] { 0, 1 }
520: // );
521: //map.defaultReturnValue( 2 );
522: //DispatchingStrategy strategy = new StringBasedDispatchingStrategy( MetadataKeys.MIMETYPE, map );
523:
524: Properties p = new Properties();
525: p.addProperty(MetadataKeys.FIELDNAME.name().toLowerCase(),
526: "text,title");
527: p.addProperty(MetadataKeys.KEY.name().toLowerCase(),
528: PropertyBasedDocumentFactory.MetadataKeys.MIMETYPE
529: .name());
530: p
531: .addProperty(
532: MetadataKeys.RULE.name().toLowerCase(),
533: "application/pdf:it.unimi.dsi.mg4j.document.PdfDocumentFactory,text/html:it.unimi.dsi.mg4j.document.HtmlDocumentFactory,?:it.unimi.dsi.mg4j.document.IdentityDocumentFactory");
534: p.addProperty(MetadataKeys.MAP.name().toLowerCase(),
535: "0:-1,0:1,0:-1");
536: p.addProperty(MetadataKeys.MAP.name().toLowerCase(),
537: "0:-1,0:1,0:-1");
538: p.addProperty(MetadataKeys.MAP.name().toLowerCase(),
539: "0:-1,0:1,0:-1");
540: p.addProperty(
541: PropertyBasedDocumentFactory.MetadataKeys.ENCODING
542: .name().toLowerCase(), "iso-8859-1");
543:
544: DispatchingDocumentFactory factory = new DispatchingDocumentFactory(
545: p);
546: DocumentCollection dc = new FileSetDocumentCollection(arg,
547: factory);
548: BinIO.storeObject(dc, "test.collection");
549: }
550: }
|