0001: /*
0002: * This program is free software; you can redistribute it and/or modify
0003: * it under the terms of the GNU General Public License as published by
0004: * the Free Software Foundation; either version 2 of the License, or
0005: * (at your option) any later version.
0006: *
0007: * This program is distributed in the hope that it will be useful,
0008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0010: * GNU General Public License for more details.
0011: *
0012: * You should have received a copy of the GNU General Public License
0013: * along with this program; if not, write to the Free Software
0014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0015: */
0016:
0017: /*
0018: * Filter.java
0019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
0020: *
0021: */
0022:
0023: package weka.filters;
0024:
0025: import weka.core.Capabilities;
0026: import weka.core.CapabilitiesHandler;
0027: import weka.core.Instance;
0028: import weka.core.Instances;
0029: import weka.core.Option;
0030: import weka.core.OptionHandler;
0031: import weka.core.Queue;
0032: import weka.core.RelationalLocator;
0033: import weka.core.SerializedObject;
0034: import weka.core.StringLocator;
0035: import weka.core.Utils;
0036: import weka.core.Capabilities.Capability;
0037: import weka.core.converters.ConverterUtils.DataSource;
0038:
0039: import java.io.FileOutputStream;
0040: import java.io.PrintWriter;
0041: import java.io.Serializable;
0042: import java.util.Enumeration;
0043: import java.util.Iterator;
0044:
0045: /**
0046: * An abstract class for instance filters: objects that take instances
0047: * as input, carry out some transformation on the instance and then
0048: * output the instance. The method implementations in this class
0049: * assume that most of the work will be done in the methods overridden
0050: * by subclasses.<p>
0051: *
0052: * A simple example of filter use. This example doesn't remove
0053: * instances from the output queue until all instances have been
0054: * input, so has higher memory consumption than an approach that
0055: * uses output instances as they are made available:<p>
0056: *
0057: * <code> <pre>
* Filter filter = ..some type of filter..
* Instances data = ..some instances..
* filter.setInputFormat(data);
* for (int i = 0; i < data.numInstances(); i++) {
* filter.input(data.instance(i));
* }
* filter.batchFinished();
* Instances newData = filter.getOutputFormat();
0065: * Instance processed;
0066: * while ((processed = filter.output()) != null) {
0067: * newData.add(processed);
0068: * }
0069: * ..do something with newData..
0070: * </pre> </code>
0071: *
0072: * @author Len Trigg (trigg@cs.waikato.ac.nz)
0073: * @version $Revision: 1.38 $
0074: */
0075: public abstract class Filter implements Serializable,
0076: CapabilitiesHandler {
0077:
0078: /** for serialization */
0079: private static final long serialVersionUID = -8835063755891851218L;
0080:
0081: /** The output format for instances */
0082: private Instances m_OutputFormat = null;
0083:
0084: /** The output instance queue */
0085: private Queue m_OutputQueue = null;
0086:
0087: /** Indices of string attributes in the output format */
0088: protected StringLocator m_OutputStringAtts = null;
0089:
0090: /** Indices of string attributes in the input format */
0091: protected StringLocator m_InputStringAtts = null;
0092:
0093: /** Indices of relational attributes in the output format */
0094: protected RelationalLocator m_OutputRelAtts = null;
0095:
0096: /** Indices of relational attributes in the input format */
0097: protected RelationalLocator m_InputRelAtts = null;
0098:
0099: /** The input format for instances */
0100: private Instances m_InputFormat = null;
0101:
0102: /** Record whether the filter is at the start of a batch */
0103: protected boolean m_NewBatch = true;
0104:
0105: /** True if the first batch has been done */
0106: protected boolean m_FirstBatchDone = false;
0107:
0108: /**
0109: * Returns true if the a new batch was started, either a new instance of the
0110: * filter was created or the batchFinished() method got called.
0111: *
0112: * @return true if a new batch has been initiated
0113: * @see #m_NewBatch
0114: * @see #batchFinished()
0115: */
0116: public boolean isNewBatch() {
0117: return m_NewBatch;
0118: }
0119:
0120: /**
0121: * Returns true if the first batch of instances got processed. Necessary for
0122: * supervised filters, which "learn" from the first batch and then shouldn't
0123: * get updated with subsequent calls of batchFinished().
0124: *
0125: * @return true if the first batch has been processed
0126: * @see #m_FirstBatchDone
0127: * @see #batchFinished()
0128: */
0129: public boolean isFirstBatchDone() {
0130: return m_FirstBatchDone;
0131: }
0132:
0133: /**
0134: * Returns the Capabilities of this filter. Derived filters have to
0135: * override this method to enable capabilities.
0136: *
0137: * @return the capabilities of this object
0138: * @see Capabilities
0139: */
0140: public Capabilities getCapabilities() {
0141: Capabilities result;
0142:
0143: result = new Capabilities(this );
0144: result.setMinimumNumberInstances(0);
0145:
0146: return result;
0147: }
0148:
0149: /**
0150: * Returns the Capabilities of this filter, customized based on the data.
0151: * I.e., if removes all class capabilities, in case there's not class
0152: * attribute present or removes the NO_CLASS capability, in case that
0153: * there's a class present.
0154: *
0155: * @param data the data to use for customization
0156: * @return the capabilities of this object, based on the data
0157: * @see #getCapabilities()
0158: */
0159: public Capabilities getCapabilities(Instances data) {
0160: Capabilities result;
0161: Capabilities classes;
0162: Iterator iter;
0163: Capability cap;
0164:
0165: result = getCapabilities();
0166:
0167: // no class? -> remove all class capabilites apart from NO_CLASS
0168: if (data.classIndex() == -1) {
0169: classes = result.getClassCapabilities();
0170: iter = classes.capabilities();
0171: while (iter.hasNext()) {
0172: cap = (Capability) iter.next();
0173: if (cap != Capability.NO_CLASS) {
0174: result.disable(cap);
0175: result.disableDependency(cap);
0176: }
0177: }
0178: }
0179: // class? -> remove NO_CLASS
0180: else {
0181: result.disable(Capability.NO_CLASS);
0182: result.disableDependency(Capability.NO_CLASS);
0183: }
0184:
0185: return result;
0186: }
0187:
0188: /**
0189: * Sets the format of output instances. The derived class should use this
0190: * method once it has determined the outputformat. The
0191: * output queue is cleared.
0192: *
0193: * @param outputFormat the new output format
0194: */
0195: protected void setOutputFormat(Instances outputFormat) {
0196:
0197: if (outputFormat != null) {
0198: m_OutputFormat = outputFormat.stringFreeStructure();
0199: initOutputLocators(m_OutputFormat, null);
0200:
0201: // Rename the relation
0202: String relationName = outputFormat.relationName() + "-"
0203: + this .getClass().getName();
0204: if (this instanceof OptionHandler) {
0205: String[] options = ((OptionHandler) this ).getOptions();
0206: for (int i = 0; i < options.length; i++) {
0207: relationName += options[i].trim();
0208: }
0209: }
0210: m_OutputFormat.setRelationName(relationName);
0211: } else {
0212: m_OutputFormat = null;
0213: }
0214: m_OutputQueue = new Queue();
0215: }
0216:
0217: /**
0218: * Gets the currently set inputformat instances. This dataset may contain
0219: * buffered instances.
0220: *
0221: * @return the input Instances.
0222: */
0223: protected Instances getInputFormat() {
0224:
0225: return m_InputFormat;
0226: }
0227:
0228: /**
0229: * Returns a reference to the current input format without
0230: * copying it.
0231: *
0232: * @return a reference to the current input format
0233: */
0234: protected Instances inputFormatPeek() {
0235:
0236: return m_InputFormat;
0237: }
0238:
0239: /**
0240: * Returns a reference to the current output format without
0241: * copying it.
0242: *
0243: * @return a reference to the current output format
0244: */
0245: protected Instances outputFormatPeek() {
0246:
0247: return m_OutputFormat;
0248: }
0249:
0250: /**
0251: * Adds an output instance to the queue. The derived class should use this
0252: * method for each output instance it makes available.
0253: *
0254: * @param instance the instance to be added to the queue.
0255: */
0256: protected void push(Instance instance) {
0257:
0258: if (instance != null) {
0259: if (instance.dataset() != null)
0260: copyValues(instance, false);
0261: instance.setDataset(m_OutputFormat);
0262: m_OutputQueue.push(instance);
0263: }
0264: }
0265:
0266: /**
0267: * Clears the output queue.
0268: */
0269: protected void resetQueue() {
0270:
0271: m_OutputQueue = new Queue();
0272: }
0273:
0274: /**
0275: * Adds the supplied input instance to the inputformat dataset for
0276: * later processing. Use this method rather than
0277: * getInputFormat().add(instance). Or else. Note that the provided
0278: * instance gets copied when buffered.
0279: *
0280: * @param instance the <code>Instance</code> to buffer.
0281: */
0282: protected void bufferInput(Instance instance) {
0283:
0284: if (instance != null) {
0285: copyValues(instance, true);
0286: m_InputFormat.add(instance);
0287: }
0288: }
0289:
0290: /**
0291: * Initializes the input attribute locators. If indices is null then all
0292: * attributes of the data will be considered, otherwise only the ones
0293: * that were provided.
0294: *
0295: * @param data the data to initialize the locators with
0296: * @param indices if not null, the indices to which to restrict
0297: * the locating
0298: */
0299: protected void initInputLocators(Instances data, int[] indices) {
0300: if (indices == null) {
0301: m_InputStringAtts = new StringLocator(data);
0302: m_InputRelAtts = new RelationalLocator(data);
0303: } else {
0304: m_InputStringAtts = new StringLocator(data, indices);
0305: m_InputRelAtts = new RelationalLocator(data, indices);
0306: }
0307: }
0308:
0309: /**
0310: * Initializes the output attribute locators. If indices is null then all
0311: * attributes of the data will be considered, otherwise only the ones
0312: * that were provided.
0313: *
0314: * @param data the data to initialize the locators with
0315: * @param indices if not null, the indices to which to restrict
0316: * the locating
0317: */
0318: protected void initOutputLocators(Instances data, int[] indices) {
0319: if (indices == null) {
0320: m_OutputStringAtts = new StringLocator(data);
0321: m_OutputRelAtts = new RelationalLocator(data);
0322: } else {
0323: m_OutputStringAtts = new StringLocator(data, indices);
0324: m_OutputRelAtts = new RelationalLocator(data, indices);
0325: }
0326: }
0327:
0328: /**
0329: * Copies string/relational values contained in the instance copied to a new
0330: * dataset. The Instance must already be assigned to a dataset. This
0331: * dataset and the destination dataset must have the same structure.
0332: *
0333: * @param instance the Instance containing the string/relational
0334: * values to copy.
0335: * @param isInput if true the input format and input attribute
0336: * locators are used otherwise the output format
0337: * and output locators
0338: */
0339: protected void copyValues(Instance instance, boolean isInput) {
0340:
0341: RelationalLocator.copyRelationalValues(instance,
0342: (isInput) ? m_InputFormat : m_OutputFormat,
0343: (isInput) ? m_InputRelAtts : m_OutputRelAtts);
0344:
0345: StringLocator.copyStringValues(instance,
0346: (isInput) ? m_InputFormat : m_OutputFormat,
0347: (isInput) ? m_InputStringAtts : m_OutputStringAtts);
0348: }
0349:
0350: /**
0351: * Takes string/relational values referenced by an Instance and copies them
0352: * from a source dataset to a destination dataset. The instance references are
0353: * updated to be valid for the destination dataset. The instance may have the
0354: * structure (i.e. number and attribute position) of either dataset (this
0355: * affects where references are obtained from). Only works if the number
0356: * of string/relational attributes is the same in both indices (implicitly
0357: * these string/relational attributes should be semantically same but just
0358: * with shifted positions).
0359: *
0360: * @param instance the instance containing references to strings/
0361: * relational values in the source dataset that
0362: * will have references updated to be valid for
0363: * the destination dataset.
0364: * @param instSrcCompat true if the instance structure is the same as
0365: * the source, or false if it is the same as the
0366: * destination (i.e. which of the string/relational
0367: * attribute indices contains the correct locations
0368: * for this instance).
0369: * @param srcDataset the dataset for which the current instance
0370: * string/relational value references are valid
0371: * (after any position mapping if needed)
0372: * @param destDataset the dataset for which the current instance
0373: * string/relational value references need to be
0374: * inserted (after any position mapping if needed)
0375: */
0376: protected void copyValues(Instance instance, boolean instSrcCompat,
0377: Instances srcDataset, Instances destDataset) {
0378:
0379: RelationalLocator.copyRelationalValues(instance, instSrcCompat,
0380: srcDataset, m_InputRelAtts, destDataset,
0381: m_OutputRelAtts);
0382:
0383: StringLocator.copyStringValues(instance, instSrcCompat,
0384: srcDataset, m_InputStringAtts, getOutputFormat(),
0385: m_OutputStringAtts);
0386: }
0387:
0388: /**
0389: * This will remove all buffered instances from the inputformat dataset.
0390: * Use this method rather than getInputFormat().delete();
0391: */
0392: protected void flushInput() {
0393:
0394: if ((m_InputStringAtts.getAttributeIndices().length > 0)
0395: || (m_InputRelAtts.getAttributeIndices().length > 0)) {
0396: m_InputFormat = m_InputFormat.stringFreeStructure();
0397: } else {
0398: // This more efficient than new Instances(m_InputFormat, 0);
0399: m_InputFormat.delete();
0400: }
0401: }
0402:
0403: /**
0404: * tests the data whether the filter can actually handle it
0405: *
0406: * @param instanceInfo the data to test
0407: * @throws Exception if the test fails
0408: */
0409: protected void testInputFormat(Instances instanceInfo)
0410: throws Exception {
0411: getCapabilities(instanceInfo).testWithFail(instanceInfo);
0412: }
0413:
0414: /**
0415: * Sets the format of the input instances. If the filter is able to
0416: * determine the output format before seeing any input instances, it
0417: * does so here. This default implementation clears the output format
0418: * and output queue, and the new batch flag is set. Overriders should
0419: * call <code>super.setInputFormat(Instances)</code>
0420: *
0421: * @param instanceInfo an Instances object containing the input instance
0422: * structure (any instances contained in the object are ignored - only the
0423: * structure is required).
0424: * @return true if the outputFormat may be collected immediately
0425: * @throws Exception if the inputFormat can't be set successfully
0426: */
0427: public boolean setInputFormat(Instances instanceInfo)
0428: throws Exception {
0429:
0430: testInputFormat(instanceInfo);
0431:
0432: m_InputFormat = instanceInfo.stringFreeStructure();
0433: m_OutputFormat = null;
0434: m_OutputQueue = new Queue();
0435: m_NewBatch = true;
0436: m_FirstBatchDone = false;
0437: initInputLocators(m_InputFormat, null);
0438: return false;
0439: }
0440:
0441: /**
0442: * Gets the format of the output instances. This should only be called
0443: * after input() or batchFinished() has returned true. The relation
0444: * name of the output instances should be changed to reflect the
0445: * action of the filter (eg: add the filter name and options).
0446: *
0447: * @return an Instances object containing the output instance
0448: * structure only.
0449: * @throws NullPointerException if no input structure has been
0450: * defined (or the output format hasn't been determined yet)
0451: */
0452: public Instances getOutputFormat() {
0453:
0454: if (m_OutputFormat == null) {
0455: throw new NullPointerException("No output format defined.");
0456: }
0457: return new Instances(m_OutputFormat, 0);
0458: }
0459:
0460: /**
0461: * Input an instance for filtering. Ordinarily the instance is
0462: * processed and made available for output immediately. Some filters
0463: * require all instances be read before producing output, in which
0464: * case output instances should be collected after calling
0465: * batchFinished(). If the input marks the start of a new batch, the
0466: * output queue is cleared. This default implementation assumes all
0467: * instance conversion will occur when batchFinished() is called.
0468: *
0469: * @param instance the input instance
0470: * @return true if the filtered instance may now be
0471: * collected with output().
0472: * @throws NullPointerException if the input format has not been
0473: * defined.
0474: * @throws Exception if the input instance was not of the correct
0475: * format or if there was a problem with the filtering.
0476: */
0477: public boolean input(Instance instance) throws Exception {
0478:
0479: if (m_InputFormat == null) {
0480: throw new NullPointerException(
0481: "No input instance format defined");
0482: }
0483: if (m_NewBatch) {
0484: m_OutputQueue = new Queue();
0485: m_NewBatch = false;
0486: }
0487: bufferInput(instance);
0488: return false;
0489: }
0490:
0491: /**
0492: * Signify that this batch of input to the filter is finished. If
0493: * the filter requires all instances prior to filtering, output()
0494: * may now be called to retrieve the filtered instances. Any
0495: * subsequent instances filtered should be filtered based on setting
0496: * obtained from the first batch (unless the inputFormat has been
0497: * re-assigned or new options have been set). This default
0498: * implementation assumes all instance processing occurs during
0499: * inputFormat() and input().
0500: *
0501: * @return true if there are instances pending output
0502: * @throws NullPointerException if no input structure has been defined,
0503: * @throws Exception if there was a problem finishing the batch.
0504: */
0505: public boolean batchFinished() throws Exception {
0506:
0507: if (m_InputFormat == null) {
0508: throw new NullPointerException(
0509: "No input instance format defined");
0510: }
0511: flushInput();
0512: m_NewBatch = true;
0513: m_FirstBatchDone = true;
0514: return (numPendingOutput() != 0);
0515: }
0516:
0517: /**
0518: * Output an instance after filtering and remove from the output queue.
0519: *
0520: * @return the instance that has most recently been filtered (or null if
0521: * the queue is empty).
0522: * @throws NullPointerException if no output structure has been defined
0523: */
0524: public Instance output() {
0525:
0526: if (m_OutputFormat == null) {
0527: throw new NullPointerException(
0528: "No output instance format defined");
0529: }
0530: if (m_OutputQueue.empty()) {
0531: return null;
0532: }
0533: Instance result = (Instance) m_OutputQueue.pop();
0534: // Clear out references to old strings/relationals occasionally
0535: if (m_OutputQueue.empty() && m_NewBatch) {
0536: if ((m_OutputStringAtts.getAttributeIndices().length > 0)
0537: || (m_OutputRelAtts.getAttributeIndices().length > 0)) {
0538: m_OutputFormat = m_OutputFormat.stringFreeStructure();
0539: }
0540: }
0541: return result;
0542: }
0543:
0544: /**
0545: * Output an instance after filtering but do not remove from the
0546: * output queue.
0547: *
0548: * @return the instance that has most recently been filtered (or null if
0549: * the queue is empty).
0550: * @throws NullPointerException if no input structure has been defined
0551: */
0552: public Instance outputPeek() {
0553:
0554: if (m_OutputFormat == null) {
0555: throw new NullPointerException(
0556: "No output instance format defined");
0557: }
0558: if (m_OutputQueue.empty()) {
0559: return null;
0560: }
0561: Instance result = (Instance) m_OutputQueue.peek();
0562: return result;
0563: }
0564:
0565: /**
0566: * Returns the number of instances pending output
0567: *
0568: * @return the number of instances pending output
0569: * @throws NullPointerException if no input structure has been defined
0570: */
0571: public int numPendingOutput() {
0572:
0573: if (m_OutputFormat == null) {
0574: throw new NullPointerException(
0575: "No output instance format defined");
0576: }
0577: return m_OutputQueue.size();
0578: }
0579:
0580: /**
0581: * Returns whether the output format is ready to be collected
0582: *
0583: * @return true if the output format is set
0584: */
0585: public boolean isOutputFormatDefined() {
0586:
0587: return (m_OutputFormat != null);
0588: }
0589:
0590: /**
0591: * Creates a deep copy of the given filter using serialization.
0592: *
0593: * @param model the filter to copy
0594: * @return a deep copy of the filter
0595: * @throws Exception if an error occurs
0596: */
0597: public static Filter makeCopy(Filter model) throws Exception {
0598: return (Filter) new SerializedObject(model).getObject();
0599: }
0600:
0601: /**
0602: * Creates a given number of deep copies of the given filter using
0603: * serialization.
0604: *
0605: * @param model the filter to copy
0606: * @param num the number of filter copies to create.
0607: * @return an array of filters.
0608: * @throws Exception if an error occurs
0609: */
0610: public static Filter[] makeCopies(Filter model, int num)
0611: throws Exception {
0612:
0613: if (model == null) {
0614: throw new Exception("No model filter set");
0615: }
0616: Filter[] filters = new Filter[num];
0617: SerializedObject so = new SerializedObject(model);
0618: for (int i = 0; i < filters.length; i++) {
0619: filters[i] = (Filter) so.getObject();
0620: }
0621: return filters;
0622: }
0623:
0624: /**
0625: * Filters an entire set of instances through a filter and returns
0626: * the new set.
0627: *
0628: * @param data the data to be filtered
0629: * @param filter the filter to be used
0630: * @return the filtered set of data
0631: * @throws Exception if the filter can't be used successfully
0632: */
0633: public static Instances useFilter(Instances data, Filter filter)
0634: throws Exception {
0635: /*
0636: System.err.println(filter.getClass().getName()
0637: + " in:" + data.numInstances());
0638: */
0639: for (int i = 0; i < data.numInstances(); i++) {
0640: filter.input(data.instance(i));
0641: }
0642: filter.batchFinished();
0643: Instances newData = filter.getOutputFormat();
0644: Instance processed;
0645: while ((processed = filter.output()) != null) {
0646: newData.add(processed);
0647: }
0648:
0649: /*
0650: System.err.println(filter.getClass().getName()
0651: + " out:" + newData.numInstances());
0652: */
0653: return newData;
0654: }
0655:
0656: /**
0657: * Method for testing filters.
0658: *
0659: * @param filter the filter to use
0660: * @param options should contain the following arguments: <br>
0661: * -i input_file <br>
0662: * -o output_file <br>
0663: * -c class_index <br>
0664: * or -h for help on options
0665: * @throws Exception if something goes wrong or the user requests help on
0666: * command options
0667: */
0668: public static void filterFile(Filter filter, String[] options)
0669: throws Exception {
0670:
0671: boolean debug = false;
0672: Instances data = null;
0673: DataSource input = null;
0674: PrintWriter output = null;
0675: boolean helpRequest;
0676:
0677: try {
0678: helpRequest = Utils.getFlag('h', options);
0679:
0680: if (Utils.getFlag('d', options)) {
0681: debug = true;
0682: }
0683: String infileName = Utils.getOption('i', options);
0684: String outfileName = Utils.getOption('o', options);
0685: String classIndex = Utils.getOption('c', options);
0686:
0687: if (filter instanceof OptionHandler) {
0688: ((OptionHandler) filter).setOptions(options);
0689: }
0690:
0691: Utils.checkForRemainingOptions(options);
0692: if (helpRequest) {
0693: throw new Exception("Help requested.\n");
0694: }
0695: if (infileName.length() != 0) {
0696: input = new DataSource(infileName);
0697: } else {
0698: input = new DataSource(System.in);
0699: }
0700: if (outfileName.length() != 0) {
0701: output = new PrintWriter(new FileOutputStream(
0702: outfileName));
0703: } else {
0704: output = new PrintWriter(System.out);
0705: }
0706:
0707: data = input.getStructure();
0708: if (classIndex.length() != 0) {
0709: if (classIndex.equals("first")) {
0710: data.setClassIndex(0);
0711: } else if (classIndex.equals("last")) {
0712: data.setClassIndex(data.numAttributes() - 1);
0713: } else {
0714: data
0715: .setClassIndex(Integer.parseInt(classIndex) - 1);
0716: }
0717: }
0718: } catch (Exception ex) {
0719: String filterOptions = "";
0720: // Output the error and also the valid options
0721: if (filter instanceof OptionHandler) {
0722: filterOptions += "\nFilter options:\n\n";
0723: Enumeration enu = ((OptionHandler) filter)
0724: .listOptions();
0725: while (enu.hasMoreElements()) {
0726: Option option = (Option) enu.nextElement();
0727: filterOptions += option.synopsis() + '\n'
0728: + option.description() + "\n";
0729: }
0730: }
0731:
0732: String genericOptions = "\nGeneral options:\n\n"
0733: + "-h\n"
0734: + "\tGet help on available options.\n"
0735: + "\t(use -b -h for help on batch mode.)\n"
0736: + "-i <file>\n"
0737: + "\tThe name of the file containing input instances.\n"
0738: + "\tIf not supplied then instances will be read from stdin.\n"
0739: + "-o <file>\n"
0740: + "\tThe name of the file output instances will be written to.\n"
0741: + "\tIf not supplied then instances will be written to stdout.\n"
0742: + "-c <class index>\n"
0743: + "\tThe number of the attribute to use as the class.\n"
0744: + "\t\"first\" and \"last\" are also valid entries.\n"
0745: + "\tIf not supplied then no class is assigned.\n";
0746:
0747: throw new Exception('\n' + ex.getMessage() + filterOptions
0748: + genericOptions);
0749: }
0750:
0751: if (debug) {
0752: System.err.println("Setting input format");
0753: }
0754: boolean printedHeader = false;
0755: if (filter.setInputFormat(data)) {
0756: if (debug) {
0757: System.err.println("Getting output format");
0758: }
0759: output.println(filter.getOutputFormat().toString());
0760: printedHeader = true;
0761: }
0762:
0763: // Pass all the instances to the filter
0764: Instance inst;
0765: while (input.hasMoreElements(data)) {
0766: inst = input.nextElement(data);
0767: if (debug) {
0768: System.err.println("Input instance to filter");
0769: }
0770: if (filter.input(inst)) {
0771: if (debug) {
0772: System.err
0773: .println("Filter said collect immediately");
0774: }
0775: if (!printedHeader) {
0776: throw new Error(
0777: "Filter didn't return true from setInputFormat() "
0778: + "earlier!");
0779: }
0780: if (debug) {
0781: System.err.println("Getting output instance");
0782: }
0783: output.println(filter.output().toString());
0784: }
0785: }
0786:
0787: // Say that input has finished, and print any pending output instances
0788: if (debug) {
0789: System.err.println("Setting end of batch");
0790: }
0791: if (filter.batchFinished()) {
0792: if (debug) {
0793: System.err.println("Filter said collect output");
0794: }
0795: if (!printedHeader) {
0796: if (debug) {
0797: System.err.println("Getting output format");
0798: }
0799: output.println(filter.getOutputFormat().toString());
0800: }
0801: if (debug) {
0802: System.err.println("Getting output instance");
0803: }
0804: while (filter.numPendingOutput() > 0) {
0805: output.println(filter.output().toString());
0806: if (debug) {
0807: System.err.println("Getting output instance");
0808: }
0809: }
0810: }
0811: if (debug) {
0812: System.err.println("Done");
0813: }
0814:
0815: if (output != null) {
0816: output.close();
0817: }
0818: }
0819:
0820: /**
0821: * Method for testing filters ability to process multiple batches.
0822: *
0823: * @param filter the filter to use
0824: * @param options should contain the following arguments:<br>
0825: * -i (first) input file <br>
0826: * -o (first) output file <br>
0827: * -r (second) input file <br>
0828: * -s (second) output file <br>
0829: * -c class_index <br>
0830: * or -h for help on options
0831: * @throws Exception if something goes wrong or the user requests help on
0832: * command options
0833: */
0834: public static void batchFilterFile(Filter filter, String[] options)
0835: throws Exception {
0836:
0837: Instances firstData = null;
0838: Instances secondData = null;
0839: DataSource firstInput = null;
0840: DataSource secondInput = null;
0841: PrintWriter firstOutput = null;
0842: PrintWriter secondOutput = null;
0843: boolean helpRequest;
0844: try {
0845: helpRequest = Utils.getFlag('h', options);
0846:
0847: String fileName = Utils.getOption('i', options);
0848: if (fileName.length() != 0) {
0849: firstInput = new DataSource(fileName);
0850: } else {
0851: throw new Exception("No first input file given.\n");
0852: }
0853:
0854: fileName = Utils.getOption('r', options);
0855: if (fileName.length() != 0) {
0856: secondInput = new DataSource(fileName);
0857: } else {
0858: throw new Exception("No second input file given.\n");
0859: }
0860:
0861: fileName = Utils.getOption('o', options);
0862: if (fileName.length() != 0) {
0863: firstOutput = new PrintWriter(new FileOutputStream(
0864: fileName));
0865: } else {
0866: firstOutput = new PrintWriter(System.out);
0867: }
0868:
0869: fileName = Utils.getOption('s', options);
0870: if (fileName.length() != 0) {
0871: secondOutput = new PrintWriter(new FileOutputStream(
0872: fileName));
0873: } else {
0874: secondOutput = new PrintWriter(System.out);
0875: }
0876: String classIndex = Utils.getOption('c', options);
0877:
0878: if (filter instanceof OptionHandler) {
0879: ((OptionHandler) filter).setOptions(options);
0880: }
0881: Utils.checkForRemainingOptions(options);
0882:
0883: if (helpRequest) {
0884: throw new Exception("Help requested.\n");
0885: }
0886: firstData = firstInput.getStructure();
0887: secondData = secondInput.getStructure();
0888: if (!secondData.equalHeaders(firstData)) {
0889: throw new Exception("Input file formats differ.\n");
0890: }
0891: if (classIndex.length() != 0) {
0892: if (classIndex.equals("first")) {
0893: firstData.setClassIndex(0);
0894: secondData.setClassIndex(0);
0895: } else if (classIndex.equals("last")) {
0896: firstData
0897: .setClassIndex(firstData.numAttributes() - 1);
0898: secondData
0899: .setClassIndex(secondData.numAttributes() - 1);
0900: } else {
0901: firstData.setClassIndex(Integer
0902: .parseInt(classIndex) - 1);
0903: secondData.setClassIndex(Integer
0904: .parseInt(classIndex) - 1);
0905: }
0906: }
0907: } catch (Exception ex) {
0908: String filterOptions = "";
0909: // Output the error and also the valid options
0910: if (filter instanceof OptionHandler) {
0911: filterOptions += "\nFilter options:\n\n";
0912: Enumeration enu = ((OptionHandler) filter)
0913: .listOptions();
0914: while (enu.hasMoreElements()) {
0915: Option option = (Option) enu.nextElement();
0916: filterOptions += option.synopsis() + '\n'
0917: + option.description() + "\n";
0918: }
0919: }
0920:
0921: String genericOptions = "\nGeneral options:\n\n"
0922: + "-h\n"
0923: + "\tGet help on available options.\n"
0924: + "-i <filename>\n"
0925: + "\tThe file containing first input instances.\n"
0926: + "-o <filename>\n"
0927: + "\tThe file first output instances will be written to.\n"
0928: + "-r <filename>\n"
0929: + "\tThe file containing second input instances.\n"
0930: + "-s <filename>\n"
0931: + "\tThe file second output instances will be written to.\n"
0932: + "-c <class index>\n"
0933: + "\tThe number of the attribute to use as the class.\n"
0934: + "\t\"first\" and \"last\" are also valid entries.\n"
0935: + "\tIf not supplied then no class is assigned.\n";
0936:
0937: throw new Exception('\n' + ex.getMessage() + filterOptions
0938: + genericOptions);
0939: }
0940: boolean printedHeader = false;
0941: if (filter.setInputFormat(firstData)) {
0942: firstOutput.println(filter.getOutputFormat().toString());
0943: printedHeader = true;
0944: }
0945:
0946: // Pass all the instances to the filter
0947: Instance inst;
0948: while (firstInput.hasMoreElements(firstData)) {
0949: inst = firstInput.nextElement(firstData);
0950: if (filter.input(inst)) {
0951: if (!printedHeader) {
0952: throw new Error(
0953: "Filter didn't return true from setInputFormat() "
0954: + "earlier!");
0955: }
0956: firstOutput.println(filter.output().toString());
0957: }
0958: }
0959:
0960: // Say that input has finished, and print any pending output instances
0961: if (filter.batchFinished()) {
0962: if (!printedHeader) {
0963: firstOutput
0964: .println(filter.getOutputFormat().toString());
0965: }
0966: while (filter.numPendingOutput() > 0) {
0967: firstOutput.println(filter.output().toString());
0968: }
0969: }
0970:
0971: if (firstOutput != null) {
0972: firstOutput.close();
0973: }
0974: printedHeader = false;
0975: if (filter.isOutputFormatDefined()) {
0976: secondOutput.println(filter.getOutputFormat().toString());
0977: printedHeader = true;
0978: }
0979: // Pass all the second instances to the filter
0980: while (secondInput.hasMoreElements(secondData)) {
0981: inst = secondInput.nextElement(secondData);
0982: if (filter.input(inst)) {
0983: if (!printedHeader) {
0984: throw new Error("Filter didn't return true from"
0985: + " isOutputFormatDefined() earlier!");
0986: }
0987: secondOutput.println(filter.output().toString());
0988: }
0989: }
0990:
0991: // Say that input has finished, and print any pending output instances
0992: if (filter.batchFinished()) {
0993: if (!printedHeader) {
0994: secondOutput.println(filter.getOutputFormat()
0995: .toString());
0996: }
0997: while (filter.numPendingOutput() > 0) {
0998: secondOutput.println(filter.output().toString());
0999: }
1000: }
1001: if (secondOutput != null) {
1002: secondOutput.close();
1003: }
1004: }
1005:
1006: /**
1007: * runs the filter instance with the given options.
1008: *
1009: * @param filter the filter to run
1010: * @param options the commandline options
1011: */
1012: protected static void runFilter(Filter filter, String[] options) {
1013: try {
1014: if (Utils.getFlag('b', options)) {
1015: Filter.batchFilterFile(filter, options);
1016: } else {
1017: Filter.filterFile(filter, options);
1018: }
1019: } catch (Exception e) {
1020: if ((e.toString().indexOf("Help requested") == -1)
1021: && (e.toString().indexOf("Filter options") == -1))
1022: e.printStackTrace();
1023: else
1024: System.err.println(e.getMessage());
1025: }
1026: }
1027:
1028: /**
1029: * Main method for testing this class.
1030: *
1031: * @param args should contain arguments to the filter: use -h for help
1032: */
1033: public static void main(String[] args) {
1034:
1035: try {
1036: if (args.length == 0) {
1037: throw new Exception(
1038: "First argument must be the class name of a Filter");
1039: }
1040: String fname = args[0];
1041: Filter f = (Filter) Class.forName(fname).newInstance();
1042: args[0] = "";
1043: runFilter(f, args);
1044: } catch (Exception ex) {
1045: ex.printStackTrace();
1046: System.err.println(ex.getMessage());
1047: }
1048: }
1049: }
|