0001: package weka.classifiers.functions.supportVector;
0002:
0003: import weka.core.Attribute;
0004: import weka.core.Capabilities;
0005: import weka.core.Instance;
0006: import weka.core.Instances;
0007: import weka.core.Option;
0008: import weka.core.SelectedTag;
0009: import weka.core.Tag;
0010: import weka.core.TechnicalInformation;
0011: import weka.core.TechnicalInformationHandler;
0012: import weka.core.Utils;
0013: import weka.core.Capabilities.Capability;
0014: import weka.core.TechnicalInformation.Field;
0015: import weka.core.TechnicalInformation.Type;
0016:
0017: import java.util.Enumeration;
0018: import java.util.Vector;
0019:
0020: /**
0021: <!-- globalinfo-start -->
0022: * Implementation of the subsequence kernel (SSK) as described in [1] and of the subsequence kernel with lambda pruning (SSK-LP) as described in [2].<br/>
0023: * <br/>
0024: * For more information, see<br/>
0025: * <br/>
0026: * Huma Lodhi, Craig Saunders, John Shawe-Taylor, Nello Cristianini, Christopher J. C. H. Watkins (2002). Text Classification using String Kernels. Journal of Machine Learning Research. 2:419-444.<br/>
0027: * <br/>
0028: * F. Kleedorfer, A. Seewald (2005). Implementation of a String Kernel for WEKA. Wien, Austria.
0029: * <p/>
0030: <!-- globalinfo-end -->
0031: *
0032: <!-- technical-bibtex-start -->
0033: * BibTeX:
0034: * <pre>
0035: * @article{Lodhi2002,
0036: * author = {Huma Lodhi and Craig Saunders and John Shawe-Taylor and Nello Cristianini and Christopher J. C. H. Watkins},
0037: * journal = {Journal of Machine Learning Research},
0038: * pages = {419-444},
0039: * title = {Text Classification using String Kernels},
0040: * volume = {2},
0041: * year = {2002},
0042: * HTTP = {http://www.jmlr.org/papers/v2/lodhi02a.html}
0043: * }
0044: *
0045: * @techreport{Kleedorfer2005,
0046: * address = {Wien, Austria},
0047: * author = {F. Kleedorfer and A. Seewald},
0048: * institution = {Oesterreichisches Forschungsinstitut fuer Artificial Intelligence},
0049: * number = {TR-2005-13},
0050: * title = {Implementation of a String Kernel for WEKA},
0051: * year = {2005}
0052: * }
0053: * </pre>
0054: * <p/>
0055: <!-- technical-bibtex-end -->
0056: *
0057: <!-- options-start -->
0058: * Valid options are: <p/>
0059: *
0060: * <pre> -D
0061: * Enables debugging output (if available) to be printed.
0062: * (default: off)</pre>
0063: *
0064: * <pre> -no-checks
0065: * Turns off all checks - use with caution!
0066: * (default: checks on)</pre>
0067: *
0068: * <pre> -P <0|1>
0069: * The pruning method to use:
0070: * 0 = No pruning
0071: * 1 = Lambda pruning
0072: * (default: 0)</pre>
0073: *
0074: * <pre> -C <num>
0075: * The size of the cache (a prime number).
0076: * (default: 250007)</pre>
0077: *
0078: * <pre> -IC <num>
0079: * The size of the internal cache (a prime number).
0080: * (default: 200003)</pre>
0081: *
0082: * <pre> -L <num>
0083: * The lambda constant. Penalizes non-continuous subsequence
0084: * matches. Must be in (0,1).
0085: * (default: 0.5)</pre>
0086: *
0087: * <pre> -ssl <num>
0088: * The length of the subsequence.
0089: * (default: 3)</pre>
0090: *
0091: * <pre> -ssl-max <num>
0092: * The maximum length of the subsequence.
0093: * (default: 9)</pre>
0094: *
0095: * <pre> -N
0096: * Use normalization.
0097: * (default: no)</pre>
0098: *
0099: <!-- options-end -->
0100: *
0101: * <h1>Theory</h1>
0102: * <h2>Overview</h2>
0103: * The algorithm computes a measure of similarity between two texts based on
0104: * the number and form of their common subsequences, which need not be
0105: * contiguous. This method can be parametrized by specifying the subsequence
0106: * length k, the penalty factor lambda, which penalizes non-contiguous matches,
0107: * and optional 'lambda pruning', which takes maxLambdaExponent,
0108: * <code>m</code>, as parameter. Lambda pruning causes very 'stretched'
0109: * substring matches not to be counted, thus speeding up the computation. The
0110: * functionality of SSK and SSK-LP is explained in the following using simple
0111: * examples.
0112: *
0113: * <h2>Explanation & Examples</h2>
0114: * for all of the following examples, we assume these parameter values:
0115: *<pre>
0116: *k=2
0117: *lambda=0.5
0118: *m=8 (for SSK-LP examples)
0119: *</pre>
0120: *
0121: * <h3>SSK</h3>
0122: *
0123: * <h4>Example 1</h4>
0124: *
0125: * <pre>
0126: *SSK(2,"ab","axb")=0.5^5 = 0,03125
0127: *</pre>
0128: * There is one subsequence of the length of 2 that both strings have in
0129: * common, "ab". The result of SSK is computed by raising lambda to the power
0130: * of L, where L is the length of the subsequence match in the one string plus
0131: * the length of the subsequence match in the other, in our case:
0132: * <pre>
0133: * ab axb
0134: *L= 2 + 3 = 5
0135: * </pre>
0136: * hence, the kernel yields 0.5^5 = 0,03125
0137: *
0138: * <h4>Example 2</h4>
0139: * <pre>
0140: *SSK(2,"ab","abb")=0.5^5 + 0.5^4 = 0,09375
0141: *</pre>
0142: * Here, we also have one subsequence of the length of 2 that both strings have
0143: * in common, "ab". The result of SSK is actually computed by summing over all
0144: * values computed for each occurrence of a common subsequence match. In this
0145: * example, there are two possible cases:
0146: * <pre>
0147: *ab abb
0148: *-- -- L=4
0149: *-- - - L=5
0150: * </pre>
0151: * we have two matches, one of the length of 2+2=4, one of the length of 2+3=5,
0152: * so we get the result 0.5^5 + 0.5^4 = 0,09375.
0153: *
0154: * <h3>SSK-LP</h3>
0155: * Without lambda pruning, the string kernel finds *all* common subsequences of
0156: * the given length, whereas with lambda pruning, common subsequence matches
0157: * that are too much stretched in both strings are not taken into account. It
0158: * is argued that the value yielded for such a common subsequence is too low
0159: * (<code>lambda ^(length[match_in_s] + length[match_in_t]</code>) . Tests have
0160: * shown that a tremendous speedup can be achieved using this technique while
0161: * suffering from very little quality loss. <br>
0162: * Lambda pruning is parametrized by the maximum lambda exponent. It is a good
0163: * idea to choose that value to be about 3 or 4 times the subsequence length as
0164: * a rule of thumb. YMMV.
0165: *
0166: * <h4>Example 3</h4>
0167: * Without lambda pruning, one common subsequence,
0168: * "AB" would be found in the following two strings. (With k=2)
0169: * <pre>
0170: *SSK(2,"ab","axb")=0.5^14 = 0,00006103515625
0171: *</pre>
0172: * lambda pruning allows for the control of the match length. So, if m
0173: * (the maximum lambda exponent) is e.g. 8, these two strings would
0174: * yield a kernel value of 0:
0175: * <pre>
0176: *with lambda pruning: SSK-LP(2,8,"AxxxxxxxxxB","AyB")= 0
0177: *without lambda pruning: SSK(2,"AxxxxxxxxxB","AyB")= 0.5^14 = 0,00006103515625
0178: *</pre>
0179: * This is because the exponent for lambda (=the length of the subsequence
0180: * match) would be 14, which is > 8. In Contrast, the next result is
0181: * > 0
0182: *<pre>
0183: *m=8
0184: *SSK-LP(2,8,"AxxB","AyyB")=0.5^8 = 0,00390625
0185: *</pre>
0186: * because the lambda exponent would be 8, which is just accepted by lambda
0187: * pruning.
0188: *
0189: * <h3>Normalization</h3>
0190: * When the string kernel is used for its main purpose, as the kernel of a
0191: * support vector machine, it is not normalized. The normalized kernel can be
0192: * switched on by -F (feature space normalization) but is much slower. Like
0193: * most unnormalized kernels, K(x,x) is not a fixed value, see the next
0194: * example.
0195: *
0196: * <h4>Example 4</h4>
0197: *<pre>
0198: *SSK(2,"ab","ab")=0.5^4 = 0.0625
0199: *SSK(2,"AxxxxxxxxxB","AxxxxxxxxxB") = 12.761724710464478
0200: *</pre>
0201: * SSK is evaluated twice, each time for two identical strings. A good measure
0202: * of similarity would produce the same value in both cases, which should
0203: * indicate the same level of similarity. The value of the normalized SSK would
0204: * be 1.0 in both cases. So for the purpose of computing string similarity the
0205: * normalized kernel should be used. For SVM the unnormalized kernel is usually
0206: * sufficient.
0207: *
0208: * <h2>Complexity of SSK and SSK-LP</h2>
0209: * The time complexity of this method (without lambda pruning and with an
0210: * infinitely large cache) is<br>
0211: * <pre>O(k*|s|*|t|)</pre>
0212: * Lambda Pruning has a complexity (without caching) of<br>
0213: * <pre>O(m*binom(m,k)^2*(|s|+n)*|t|)</pre> <br>
0214: * <pre>
0215: *k... subsequence length (ssl)
0216: *s,t... strings
0217: *|s|... length of string s
0218: *binom(x,y)... binomial coefficient (x!/[(x-y)!y!])
0219: *m... maxLambdaExponent (ssl-max)
0220: *</pre>
0221: *
0222: * Keep in mind that execution time can increase fast for long strings
0223: * and big values for k, especially if you don't use lambda pruning.
0224: * With lambda pruning, computation is usually so fast that switching
0225: * on the cache leads to slower computation because of setup costs. Therefore
0226: * caching is switched off for lambda pruning.
0227: * <br>
0228: * <br>
0229: * For details and qualitative experiments about SSK, see [1] <br>
0230: * For details about lambda pruning and performance comparison of SSK
0231: * and SSK-LP (SSK with lambda pruning), see [2]
0232: * Note that the complexity estimation in [2] assumes no caching of
0233: * intermediate results, which has been implemented in the meantime and
0234: * greatly improves the speed of the SSK without lambda pruning.
0235: *<br>
0236: *
0237: *<h1>Notes for usage within Weka</h1>
0238: * Only instances of the following form can be processed using string kernels:
0239: * <pre>
0240: *+----------+-------------+---------------+
0241: *|attribute#| 0 | 1 |
0242: *+----------+-------------+---------------+
0243: *| content | [text data] | [class label] |
0244: *+----------------------------------------+
0245: * ... or ...
0246: *+----------+---------------+-------------+
0247: *|attribute#| 0 | 1 |
0248: *+----------+---------------+-------------+
0249: *| content | [class label] | [text data] |
0250: *+----------------------------------------+
0251: *</pre>
0252: *
0253: * @author Florian Kleedorfer (kleedorfer@austria.fm)
0254: * @author Alexander K. Seewald (alex@seewald.at)
0255: * @version $Revision: 1.5 $
0256: */
0257: public class StringKernel extends Kernel implements
0258: TechnicalInformationHandler {
0259:
0260: /** for serialization */
0261: private static final long serialVersionUID = -4902954211202690123L;
0262:
0263: /** The size of the cache (a prime number) */
0264: private int m_cacheSize = 250007;
0265:
0266: /** The size of the internal cache for intermediate results (a prime number) */
0267: private int m_internalCacheSize = 200003;
0268:
0269: /** The attribute number of the string attribute */
0270: private int m_strAttr;
0271:
0272: /** Kernel cache (i.e., cache for kernel evaluations) */
0273: private double[] m_storage;
0274: private long[] m_keys;
0275:
0276: /** Counts the number of kernel evaluations. */
0277: private int m_kernelEvals;
0278:
0279: /** The number of instance in the dataset */
0280: private int m_numInsts;
0281:
0282: /** Pruning method: No Pruning */
0283: public final static int PRUNING_NONE = 0;
0284: /** Pruning method: Lambda See [2] for details. */
0285: public final static int PRUNING_LAMBDA = 1;
0286: /** Pruning methods */
0287: public static final Tag[] TAGS_PRUNING = {
0288: new Tag(PRUNING_NONE, "No pruning"),
0289: new Tag(PRUNING_LAMBDA, "Lambda pruning"), };
0290:
0291: /** the pruning method */
0292: protected int m_PruningMethod = PRUNING_NONE;
0293:
0294: /** the decay factor that penalizes non-continuous substring matches. See [1]
0295: * for details. */
0296: protected double m_lambda = 0.5;
0297:
0298: /** The substring length */
0299: private int m_subsequenceLength = 3;
0300:
0301: /** The maximum substring length for lambda pruning */
0302: private int m_maxSubsequenceLength = 9;
0303:
0304: /** powers of lambda are prepared prior to kernel evaluations.
0305: * all powers between 0 and this value are precalculated */
0306: protected static final int MAX_POWER_OF_LAMBDA = 10000;
0307:
0308: /** the precalculated powers of lambda */
0309: protected double[] m_powersOflambda = null;
0310:
0311: /** flag for switching normalization on or off. This defaults to false and
0312: * can be turned on by the switch for feature space normalization in SMO
0313: */
0314: private boolean m_normalize = false;
0315:
0316: /** private cache for intermediate results */
0317: private int maxCache; // is set in unnormalizedKernel(s1,s2)
0318: private double[] cachekh;
0319: private int[] cachekhK;
0320: private double[] cachekh2;
0321: private int[] cachekh2K;
0322: /** cached indexes for private cache */
0323: private int m_multX;
0324: private int m_multY;
0325: private int m_multZ;
0326: private int m_multZZ;
0327:
0328: private boolean m_useRecursionCache = true;
0329:
0330: /**
0331: * default constructor
0332: */
0333: public StringKernel() {
0334: super ();
0335: }
0336:
0337: /**
0338: * creates a new StringKernel object. Initializes the kernel cache and the
0339: * 'lambda cache', i.e. the precalculated powers of lambda from lambda^2 to
0340: * lambda^MAX_POWER_OF_LAMBDA
0341: *
0342: * @param data the dataset to use
0343: * @param cacheSize the size of the cache
0344: * @param subsequenceLength the subsequence length
0345: * @param lambda the lambda value
0346: * @param debug whether to output debug information
0347: * @throws Exception if something goes wrong
0348: */
0349: public StringKernel(Instances data, int cacheSize,
0350: int subsequenceLength, double lambda, boolean debug)
0351: throws Exception {
0352:
0353: setDebug(debug);
0354: setCacheSize(cacheSize);
0355: setInternalCacheSize(200003);
0356: setSubsequenceLength(subsequenceLength);
0357: setMaxSubsequenceLength(-1);
0358: setLambda(lambda);
0359:
0360: buildKernel(data);
0361: }
0362:
0363: /**
0364: * Returns a string describing the kernel
0365: *
0366: * @return a description suitable for displaying in the
0367: * explorer/experimenter gui
0368: */
0369: public String globalInfo() {
0370: return "Implementation of the subsequence kernel (SSK) as described in [1] "
0371: + "and of the subsequence kernel with lambda pruning (SSK-LP) as "
0372: + "described in [2].\n\n"
0373: + "For more information, see\n\n"
0374: + getTechnicalInformation().toString();
0375: }
0376:
0377: /**
0378: * Returns an instance of a TechnicalInformation object, containing
0379: * detailed information about the technical background of this class,
0380: * e.g., paper reference or book this class is based on.
0381: *
0382: * @return the technical information about this class
0383: */
0384: public TechnicalInformation getTechnicalInformation() {
0385: TechnicalInformation result;
0386: TechnicalInformation additional;
0387:
0388: result = new TechnicalInformation(Type.ARTICLE);
0389: result
0390: .setValue(
0391: Field.AUTHOR,
0392: "Huma Lodhi and Craig Saunders and John Shawe-Taylor and Nello Cristianini and Christopher J. C. H. Watkins");
0393: result.setValue(Field.YEAR, "2002");
0394: result.setValue(Field.TITLE,
0395: "Text Classification using String Kernels");
0396: result.setValue(Field.JOURNAL,
0397: "Journal of Machine Learning Research");
0398: result.setValue(Field.VOLUME, "2");
0399: result.setValue(Field.PAGES, "419-444");
0400: result.setValue(Field.HTTP,
0401: "http://www.jmlr.org/papers/v2/lodhi02a.html");
0402:
0403: additional = result.add(Type.TECHREPORT);
0404: additional.setValue(Field.AUTHOR,
0405: "F. Kleedorfer and A. Seewald");
0406: additional.setValue(Field.YEAR, "2005");
0407: additional.setValue(Field.TITLE,
0408: "Implementation of a String Kernel for WEKA");
0409: additional
0410: .setValue(Field.INSTITUTION,
0411: "Oesterreichisches Forschungsinstitut fuer Artificial Intelligence");
0412: additional.setValue(Field.ADDRESS, "Wien, Austria");
0413: additional.setValue(Field.NUMBER, "TR-2005-13");
0414:
0415: return result;
0416: }
0417:
0418: /**
0419: * Returns an enumeration describing the available options.
0420: *
0421: * @return an enumeration of all the available options.
0422: */
0423: public Enumeration listOptions() {
0424: Vector result;
0425: Enumeration en;
0426: String desc;
0427: String param;
0428: int i;
0429: SelectedTag tag;
0430:
0431: result = new Vector();
0432:
0433: en = super .listOptions();
0434: while (en.hasMoreElements())
0435: result.addElement(en.nextElement());
0436:
0437: desc = "";
0438: param = "";
0439: for (i = 0; i < TAGS_PRUNING.length; i++) {
0440: if (i > 0)
0441: param += "|";
0442: tag = new SelectedTag(TAGS_PRUNING[i].getID(), TAGS_PRUNING);
0443: param += "" + tag.getSelectedTag().getID();
0444: desc += "\t" + tag.getSelectedTag().getID() + " = "
0445: + tag.getSelectedTag().getReadable() + "\n";
0446: }
0447:
0448: result.addElement(new Option("\tThe pruning method to use:\n"
0449: + desc + "\t(default: " + PRUNING_NONE + ")", "P", 1,
0450: "-P <" + param + ">"));
0451:
0452: result.addElement(new Option(
0453: "\tThe size of the cache (a prime number).\n"
0454: + "\t(default: 250007)", "C", 1, "-C <num>"));
0455:
0456: result.addElement(new Option(
0457: "\tThe size of the internal cache (a prime number).\n"
0458: + "\t(default: 200003)", "IC", 1, "-IC <num>"));
0459:
0460: result.addElement(new Option(
0461: "\tThe lambda constant. Penalizes non-continuous subsequence\n"
0462: + "\tmatches. Must be in (0,1).\n"
0463: + "\t(default: 0.5)", "L", 1, "-L <num>"));
0464:
0465: result
0466: .addElement(new Option(
0467: "\tThe length of the subsequence.\n"
0468: + "\t(default: 3)", "ssl", 1,
0469: "-ssl <num>"));
0470:
0471: result.addElement(new Option(
0472: "\tThe maximum length of the subsequence.\n"
0473: + "\t(default: 9)", "ssl-max", 1,
0474: "-ssl-max <num>"));
0475:
0476: result.addElement(new Option("\tUse normalization.\n"
0477: + "\t(default: no)", "N", 0, "-N"));
0478:
0479: return result.elements();
0480: }
0481:
0482: /**
0483: * Parses a given list of options. <p/>
0484: *
0485: <!-- options-start -->
0486: * Valid options are: <p/>
0487: *
0488: * <pre> -D
0489: * Enables debugging output (if available) to be printed.
0490: * (default: off)</pre>
0491: *
0492: * <pre> -no-checks
0493: * Turns off all checks - use with caution!
0494: * (default: checks on)</pre>
0495: *
0496: * <pre> -P <0|1>
0497: * The pruning method to use:
0498: * 0 = No pruning
0499: * 1 = Lambda pruning
0500: * (default: 0)</pre>
0501: *
0502: * <pre> -C <num>
0503: * The size of the cache (a prime number).
0504: * (default: 250007)</pre>
0505: *
0506: * <pre> -IC <num>
0507: * The size of the internal cache (a prime number).
0508: * (default: 200003)</pre>
0509: *
0510: * <pre> -L <num>
0511: * The lambda constant. Penalizes non-continuous subsequence
0512: * matches. Must be in (0,1).
0513: * (default: 0.5)</pre>
0514: *
0515: * <pre> -ssl <num>
0516: * The length of the subsequence.
0517: * (default: 3)</pre>
0518: *
0519: * <pre> -ssl-max <num>
0520: * The maximum length of the subsequence.
0521: * (default: 9)</pre>
0522: *
0523: * <pre> -N
0524: * Use normalization.
0525: * (default: no)</pre>
0526: *
0527: <!-- options-end -->
0528: *
0529: * @param options the list of options as an array of strings
0530: * @throws Exception if an option is not supported
0531: */
0532: public void setOptions(String[] options) throws Exception {
0533: String tmpStr;
0534:
0535: tmpStr = Utils.getOption('P', options);
0536: if (tmpStr.length() != 0)
0537: setPruningMethod(new SelectedTag(Integer.parseInt(tmpStr),
0538: TAGS_PRUNING));
0539: else
0540: setPruningMethod(new SelectedTag(PRUNING_NONE, TAGS_PRUNING));
0541:
0542: tmpStr = Utils.getOption('C', options);
0543: if (tmpStr.length() != 0)
0544: setCacheSize(Integer.parseInt(tmpStr));
0545: else
0546: setCacheSize(250007);
0547:
0548: tmpStr = Utils.getOption("IC", options);
0549: if (tmpStr.length() != 0)
0550: setInternalCacheSize(Integer.parseInt(tmpStr));
0551: else
0552: setInternalCacheSize(200003);
0553:
0554: tmpStr = Utils.getOption('L', options);
0555: if (tmpStr.length() != 0)
0556: setLambda(Double.parseDouble(tmpStr));
0557: else
0558: setLambda(0.5);
0559:
0560: tmpStr = Utils.getOption("ssl", options);
0561: if (tmpStr.length() != 0)
0562: setSubsequenceLength(Integer.parseInt(tmpStr));
0563: else
0564: setSubsequenceLength(3);
0565:
0566: tmpStr = Utils.getOption("ssl-max", options);
0567: if (tmpStr.length() != 0)
0568: setMaxSubsequenceLength(Integer.parseInt(tmpStr));
0569: else
0570: setMaxSubsequenceLength(9);
0571:
0572: setUseNormalization(Utils.getFlag('N', options));
0573:
0574: if (getMaxSubsequenceLength() < 2 * getSubsequenceLength()) {
0575: throw new IllegalArgumentException(
0576: "Lambda Pruning forbids even contiguous substring matches! "
0577: + "Use a bigger value for ssl-max (at least 2*ssl).");
0578: }
0579:
0580: super .setOptions(options);
0581: }
0582:
0583: /**
0584: * Gets the current settings of the Kernel.
0585: *
0586: * @return an array of strings suitable for passing to setOptions
0587: */
0588: public String[] getOptions() {
0589: int i;
0590: Vector result;
0591: String[] options;
0592:
0593: result = new Vector();
0594: options = super .getOptions();
0595: for (i = 0; i < options.length; i++)
0596: result.add(options[i]);
0597:
0598: result.add("-P");
0599: result.add("" + m_PruningMethod);
0600:
0601: result.add("-C");
0602: result.add("" + getCacheSize());
0603:
0604: result.add("-IC");
0605: result.add("" + getInternalCacheSize());
0606:
0607: result.add("-L");
0608: result.add("" + getLambda());
0609:
0610: result.add("-ssl");
0611: result.add("" + getSubsequenceLength());
0612:
0613: result.add("-ssl-max");
0614: result.add("" + getMaxSubsequenceLength());
0615:
0616: if (getUseNormalization())
0617: result.add("-L");
0618:
0619: return (String[]) result.toArray(new String[result.size()]);
0620: }
0621:
0622: /**
0623: * Returns the tip text for this property
0624: *
0625: * @return tip text for this property suitable for
0626: * displaying in the explorer/experimenter gui
0627: */
0628: public String pruningMethodTipText() {
0629: return "The pruning method.";
0630: }
0631:
0632: /**
0633: * Sets the method used to for pruning.
0634: *
0635: * @param value the pruning method to use.
0636: */
0637: public void setPruningMethod(SelectedTag value) {
0638: if (value.getTags() == TAGS_PRUNING)
0639: m_PruningMethod = value.getSelectedTag().getID();
0640: }
0641:
0642: /**
0643: * Gets the method used for pruning.
0644: *
0645: * @return the pruning method to use.
0646: */
0647: public SelectedTag getPruningMethod() {
0648: return new SelectedTag(m_PruningMethod, TAGS_PRUNING);
0649: }
0650:
0651: /**
0652: * Sets the size of the cache to use (a prime number)
0653: *
0654: * @param value the size of the cache
0655: */
0656: public void setCacheSize(int value) {
0657: if (value >= 0) {
0658: m_cacheSize = value;
0659: clean();
0660: } else {
0661: System.out
0662: .println("Cache size cannot be smaller than 0 (provided: "
0663: + value + ")!");
0664: }
0665: }
0666:
0667: /**
0668: * Gets the size of the cache
0669: *
0670: * @return the cache size
0671: */
0672: public int getCacheSize() {
0673: return m_cacheSize;
0674: }
0675:
0676: /**
0677: * Returns the tip text for this property
0678: *
0679: * @return tip text for this property suitable for
0680: * displaying in the explorer/experimenter gui
0681: */
0682: public String cacheSizeTipText() {
0683: return "The size of the cache (a prime number).";
0684: }
0685:
0686: /**
0687: * sets the size of the internal cache for intermediate results. Memory
0688: * consumption is about 16x this amount in bytes. Only use when lambda
0689: * pruning is switched off.
0690: *
0691: * @param value the size of the internal cache
0692: */
0693: public void setInternalCacheSize(int value) {
0694: if (value >= 0) {
0695: m_internalCacheSize = value;
0696: clean();
0697: } else {
0698: System.out
0699: .println("Cache size cannot be smaller than 0 (provided: "
0700: + value + ")!");
0701: }
0702: }
0703:
0704: /**
0705: * Gets the size of the internal cache
0706: *
0707: * @return the cache size
0708: */
0709: public int getInternalCacheSize() {
0710: return m_internalCacheSize;
0711: }
0712:
0713: /**
0714: * Returns the tip text for this property
0715: *
0716: * @return tip text for this property suitable for
0717: * displaying in the explorer/experimenter gui
0718: */
0719: public String internalCacheSizeTipText() {
0720: return "The size of the internal cache (a prime number).";
0721: }
0722:
0723: /**
0724: * Sets the length of the subsequence.
0725: *
0726: * @param value the length
0727: */
0728: public void setSubsequenceLength(int value) {
0729: m_subsequenceLength = value;
0730: }
0731:
0732: /**
0733: * Returns the length of the subsequence
0734: *
0735: * @return the length
0736: */
0737: public int getSubsequenceLength() {
0738: return m_subsequenceLength;
0739: }
0740:
0741: /**
0742: * Returns the tip text for this property
0743: *
0744: * @return tip text for this property suitable for
0745: * displaying in the explorer/experimenter gui
0746: */
0747: public String subsequenceLengthTipText() {
0748: return "The subsequence length.";
0749: }
0750:
0751: /**
0752: * Sets the maximum length of the subsequence.
0753: *
0754: * @param value the maximum length
0755: */
0756: public void setMaxSubsequenceLength(int value) {
0757: m_maxSubsequenceLength = value;
0758: }
0759:
0760: /**
0761: * Returns the maximum length of the subsequence
0762: *
0763: * @return the maximum length
0764: */
0765: public int getMaxSubsequenceLength() {
0766: return m_maxSubsequenceLength;
0767: }
0768:
0769: /**
0770: * Returns the tip text for this property
0771: *
0772: * @return tip text for this property suitable for
0773: * displaying in the explorer/experimenter gui
0774: */
0775: public String maxSubsequenceLengthTipText() {
0776: return "The maximum subsequence length (theta in the paper)";
0777: }
0778:
0779: /**
0780: * Sets the lambda constant used in the string kernel
0781: *
0782: * @param value the lambda value to use
0783: */
0784: public void setLambda(double value) {
0785: m_lambda = value;
0786: }
0787:
0788: /**
0789: * Gets the lambda constant used in the string kernel
0790: *
0791: * @return the current lambda constant
0792: */
0793: public double getLambda() {
0794: return m_lambda;
0795: }
0796:
0797: /**
0798: * Returns the tip text for this property
0799: *
0800: * @return tip text for this property suitable for
0801: * displaying in the explorer/experimenter gui
0802: */
0803: public String lambdaTipText() {
0804: return "Penalizes non-continuous subsequence matches, from (0,1)";
0805: }
0806:
0807: /**
0808: * Sets whether to use normalization.
0809: * Each time this value is changed, the kernel cache is cleared.
0810: *
0811: * @param value whether to use normalization
0812: */
0813: public void setUseNormalization(boolean value) {
0814: if (value != m_normalize)
0815: clean();
0816:
0817: m_normalize = value;
0818: }
0819:
0820: /**
0821: * Returns whether normalization is used.
0822: *
0823: * @return true if normalization is used
0824: */
0825: public boolean getUseNormalization() {
0826: return m_normalize;
0827: }
0828:
0829: /**
0830: * Returns the tip text for this property
0831: *
0832: * @return tip text for this property suitable for
0833: * displaying in the explorer/experimenter gui
0834: */
0835: public String useNormalizationTipText() {
0836: return "Whether to use normalization.";
0837: }
0838:
0839: /**
0840: * Computes the result of the kernel function for two instances.
0841: * If id1 == -1, eval use inst1 instead of an instance in the dataset.
0842: *
0843: * @param id1 the index of the first instance in the dataset
0844: * @param id2 the index of the second instance in the dataset
0845: * @param inst1 the instance corresponding to id1 (used if id1 == -1)
0846: * @return the result of the kernel function
0847: * @throws Exception if something goes wrong
0848: */
0849: public double eval(int id1, int id2, Instance inst1)
0850: throws Exception {
0851: if (m_Debug && id1 > -1 && id2 > -1) {
0852: System.err.println("\nEvaluation of string kernel for");
0853: System.err.println(m_data.instance(id1).stringValue(
0854: m_strAttr));
0855: System.err.println("and");
0856: System.err.println(m_data.instance(id2).stringValue(
0857: m_strAttr));
0858: }
0859:
0860: //the normalized kernel returns 1 for comparison of
0861: //two identical strings
0862: if (id1 == id2 && m_normalize)
0863: return 1.0;
0864:
0865: double result = 0;
0866: long key = -1;
0867: int location = -1;
0868:
0869: // we can only cache if we know the indexes
0870: if ((id1 >= 0) && (m_keys != null)) {
0871: if (id1 > id2) {
0872: key = (long) id1 * m_numInsts + id2;
0873: } else {
0874: key = (long) id2 * m_numInsts + id1;
0875: }
0876: if (key < 0) {
0877: throw new Exception("Cache overflow detected!");
0878: }
0879: location = (int) (key % m_keys.length);
0880: if (m_keys[location] == (key + 1)) {
0881: if (m_Debug)
0882: System.err.println("result (cached): "
0883: + m_storage[location]);
0884: return m_storage[location];
0885: }
0886: }
0887:
0888: m_kernelEvals++;
0889: long start = System.currentTimeMillis();
0890:
0891: Instance inst2 = m_data.instance(id2);
0892: char[] s1 = inst1.stringValue(m_strAttr).toCharArray();
0893: char[] s2 = inst2.stringValue(m_strAttr).toCharArray();
0894:
0895: // prevent the kernel from returning NaN
0896: if (s1.length == 0 || s2.length == 0)
0897: return 0;
0898:
0899: if (m_normalize) {
0900: result = normalizedKernel(s1, s2);
0901: } else {
0902: result = unnormalizedKernel(s1, s2);
0903: }
0904:
0905: if (m_Debug) {
0906: long duration = System.currentTimeMillis() - start;
0907: System.err.println("result: " + result);
0908: System.err.println("evaluation time:" + duration + "\n");
0909: }
0910:
0911: // store result in cache
0912: if (key != -1) {
0913: m_storage[location] = result;
0914: m_keys[location] = (key + 1);
0915: }
0916: return result;
0917: }
0918:
0919: /**
0920: * Frees the memory used by the kernel.
0921: * (Useful with kernels which use cache.)
0922: * This function is called when the training is done.
0923: * i.e. after that, eval will be called with id1 == -1.
0924: */
0925: public void clean() {
0926: m_storage = null;
0927: m_keys = null;
0928: }
0929:
0930: /**
0931: * Returns the number of kernel evaluation performed.
0932: *
0933: * @return the number of kernel evaluation performed.
0934: */
0935: public int numEvals() {
0936: return m_kernelEvals;
0937: }
0938:
0939: /**
0940: * Returns the number of dot product cache hits.
0941: *
0942: * @return the number of dot product cache hits, or -1 if not supported by
0943: * this kernel.
0944: */
0945: public int numCacheHits() {
0946: // TODO: implement!
0947: return -1;
0948: }
0949:
0950: /**
0951: * evaluates the normalized kernel between s and t. See [1] for details about
0952: * the normalized SSK.
0953: *
0954: * @param s first input string
0955: * @param t second input string
0956: * @return a double indicating their distance, or similarity
0957: */
0958: public double normalizedKernel(char[] s, char[] t) {
0959: double k1 = unnormalizedKernel(s, s);
0960: double k2 = unnormalizedKernel(t, t);
0961: double normTerm = Math.sqrt(k1 * k2);
0962: return unnormalizedKernel(s, t) / normTerm;
0963: }
0964:
0965: /**
0966: * evaluates the unnormalized kernel between s and t. See [1] for details
0967: * about the unnormalized SSK.
0968: *
0969: * @param s first input string
0970: * @param t second input string
0971: * @return a double indicating their distance, or similarity
0972: */
0973: public double unnormalizedKernel(char[] s, char[] t) {
0974: if (t.length > s.length) {
0975: //swap because the algorithm is faster if s is
0976: //the longer string
0977: char[] buf = s;
0978: s = t;
0979: t = buf;
0980: }
0981: if (m_PruningMethod == PRUNING_NONE) {
0982: m_multX = (s.length + 1) * (t.length + 1);
0983: m_multY = (t.length + 1);
0984: m_multZ = 1;
0985: maxCache = m_internalCacheSize;
0986: if (maxCache == 0) {
0987: maxCache = (m_subsequenceLength + 1) * m_multX;
0988: } else if ((m_subsequenceLength + 1) * m_multX < maxCache) {
0989: maxCache = (m_subsequenceLength + 1) * m_multX;
0990: }
0991: m_useRecursionCache = true;
0992: cachekhK = new int[maxCache];
0993: cachekh2K = new int[maxCache];
0994: cachekh = new double[maxCache];
0995: cachekh2 = new double[maxCache];
0996: } else if (m_PruningMethod == PRUNING_LAMBDA) {
0997: maxCache = 0;
0998: m_useRecursionCache = false;
0999: }
1000:
1001: double res;
1002: if (m_PruningMethod == PRUNING_LAMBDA) {
1003: res = kernelLP(m_subsequenceLength, s, s.length - 1, t,
1004: t.length - 1, m_maxSubsequenceLength);
1005: } else {
1006: res = kernel(m_subsequenceLength, s, s.length - 1, t,
1007: t.length - 1);
1008: }
1009: cachekh = null;
1010: cachekhK = null;
1011: cachekh2 = null;
1012: cachekh2K = null;
1013:
1014: return res;
1015: }
1016:
1017: /**
1018: * Recursion-ending function that is called at the end of each
1019: * recursion branch.
1020: *
1021: * @param n
1022: * @return
1023: */
1024: protected double getReturnValue(int n) {
1025: if (n == 0)
1026: return 1;
1027: else
1028: return 0;
1029: }
1030:
1031: /**
1032: * the kernel function (Kn). This function performs the outer loop
1033: * character-wise over the first input string s. For each character
1034: * encountered, a recursion branch is started that identifies all
1035: * subsequences in t starting with that character. <br> See [1] for details
1036: * but note that this code is optimized and may be hard to recognize.
1037: *
1038: * @param n the current length of the matching subsequence
1039: * @param s first string, as a char array
1040: * @param t second string, as a char array
1041: * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1042: * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1043: * @return a double indicating the distance or similarity between s and t,
1044: * according to and depending on the initial value for n.
1045: */
1046: protected double kernel(int n, char[] s, int endIndexS, char[] t,
1047: int endIndexT) {
1048:
1049: //normal recursion ending case
1050: if (Math.min(endIndexS + 1, endIndexT + 1) < n)
1051: return getReturnValue(n);
1052:
1053: //accumulate all recursion results in one:
1054: double result = 0;
1055:
1056: //the tail-recursive function defined in [1] is turned into a
1057: //loop here, preventing stack overflows.
1058: //skim s from back to front
1059: for (int iS = endIndexS; iS > n - 2; iS--) {
1060: double buf = 0;
1061: //let the current character in s be x
1062: char x = s[iS];
1063: // iterate over all occurrences of x in t
1064: for (int j = 0; j <= endIndexT; j++) {
1065: if (t[j] == x) {
1066: //this is a match for the current character, hence
1067: //1. use previous chars in both strings (iS-1, j-1)
1068: //2. decrement the remainingMatchLength (n-1)
1069: //and start a recursion branch for these parameters
1070: buf += kernelHelper(n - 1, s, iS - 1, t, j - 1);
1071: }
1072: }
1073: //ok, all occurrences of x in t have been found
1074: //multiply the result with lambda^2
1075: // (one lambda for x, and the other for all matches of x in t)
1076: result += buf * m_powersOflambda[2];
1077: }
1078: return result;
1079: }
1080:
1081: /**
1082: * The kernel helper function, called K' in [1] and [2].
1083: *
1084: * @param n the current length of the matching subsequence
1085: * @param s first string, as a char array
1086: * @param t second string, as a char array
1087: * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1088: * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1089: * @return a partial result for K
1090: */
1091: protected double kernelHelper(int n, char[] s, int endIndexS,
1092: char[] t, int endIndexT) {
1093:
1094: //recursion ends if the current subsequence has maximal length,
1095: //which is the case here
1096: if (n <= 0) {
1097: return getReturnValue(n);
1098: }
1099:
1100: //recursion ends, too, if the current subsequence is shorter than
1101: //maximal length, but there is no chance that it will reach maximal length.
1102: //in this case, normally 0 is returned, but the EXPERIMENTAL
1103: //minSubsequenceLength feature allows shorter subsequence matches
1104: //also to contribute
1105: if (Math.min(endIndexS + 1, endIndexT + 1) < n) {
1106: return getReturnValue(n);
1107: }
1108: int adr = 0;
1109: if (m_useRecursionCache) {
1110: adr = m_multX * n + m_multY * endIndexS + m_multZ
1111: * endIndexT;
1112: if (cachekhK[adr % maxCache] == adr + 1)
1113: return cachekh[adr % maxCache];
1114: }
1115:
1116: //the tail-recursive function defined in [1] is turned into a
1117: //loop here, preventing stack overflows.
1118: //loop over s, nearly from the start (skip the first n-1 characters)
1119: //and only up until endIndexS, and recursively apply K''. Thus, every
1120: //character between n-1 and endIndexS in s is counted once as
1121: //being part of the subsequence match and once just as a gap.
1122: //In both cases lambda is multiplied with the result.
1123: double result = 0;
1124: /*
1125: for (int iS = n-1; iS <= endIndexS;iS++) {
1126: result *= m_lambda;
1127: result += kernelHelper2(n,s,iS, t, endIndexT);
1128: }
1129: if (m_useRecursionCache) {
1130: cachekhK[adr % maxCache]=adr+1; cachekh[adr % maxCache]=result;
1131: }
1132: return result;
1133: */
1134: /* ^^^ again, above code segment does not store some intermediate results... */
1135: result = m_lambda
1136: * kernelHelper(n, s, endIndexS - 1, t, endIndexT)
1137: + kernelHelper2(n, s, endIndexS, t, endIndexT);
1138: if (m_useRecursionCache) {
1139: cachekhK[adr % maxCache] = adr + 1;
1140: cachekh[adr % maxCache] = result;
1141: }
1142: return result;
1143: }
1144:
1145: /**
1146: * helper function for the evaluation of the kernel K'' see section
1147: * 'Efficient Computation of SSK' in [1]
1148: *
1149: * @param n the current length of the matching subsequence
1150: * @param s first string, as a char array
1151: * @param t second string, as a char array
1152: * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1153: * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1154: * @return a partial result for K'
1155: */
1156: protected double kernelHelper2(int n, char[] s, int endIndexS,
1157: char[] t, int endIndexT) {
1158:
1159: //recursion ends if one of the indices in both strings is <0
1160: if (endIndexS < 0 || endIndexT < 0) {
1161: return getReturnValue(n);
1162: }
1163:
1164: int adr = 0;
1165: if (m_useRecursionCache) {
1166: adr = m_multX * n + m_multY * endIndexS + m_multZ
1167: * endIndexT;
1168: if (cachekh2K[adr % maxCache] == adr + 1)
1169: return cachekh2[adr % maxCache];
1170: }
1171:
1172: //spot the last character in s, we'll need it
1173: char x = s[endIndexS];
1174:
1175: //recurse if the last characters of s and t, x (and y) are identical.
1176: //which is an easy case: just add up two recursions,
1177: // 1. one that counts x and y as a part of the subsequence match
1178: // -> n, endIndexS and endIndexT are decremented for next recursion level
1179: // -> lambda^2 is multiplied with the result to account for the length
1180: // of 2 that has been added to the length of the subsequence match
1181: // by accepting x and y.
1182: // 2. one that counts y as a gap in the match
1183: // -> only endIndexT is decremented for next recursion level
1184: // -> lambda is multiplied with the result to account for the length
1185: // of 1 that has been added to the length of the subsequence match
1186: // by omitting y.
1187: if (x == t[endIndexT]) {
1188: double ret = m_lambda
1189: * (kernelHelper2(n, s, endIndexS, t, endIndexT - 1) + m_lambda
1190: * kernelHelper(n - 1, s, endIndexS - 1, t,
1191: endIndexT - 1));
1192: if (m_useRecursionCache) {
1193: cachekh2K[adr % maxCache] = adr + 1;
1194: cachekh2[adr % maxCache] = ret;
1195: }
1196: return ret;
1197: } else {
1198: double ret = m_lambda
1199: * kernelHelper2(n, s, endIndexS, t, endIndexT - 1);
1200: if (m_useRecursionCache) {
1201: cachekh2K[adr % maxCache] = adr + 1;
1202: cachekh2[adr % maxCache] = ret;
1203: }
1204: return ret;
1205: }
1206:
1207: //look for x in t from back to front.
1208: //this is actually an optimization from [1] that spares unneccessary
1209: //recursions iff
1210: //x is actually found in t, but not at the last position.
1211: /*
1212: int i;
1213: int threshold = n>0?n-1:0;
1214: for (i=endIndexT-1; i >= threshold;i--) {
1215: if (x == t[i]) {
1216: double ret=getPowerOfLambda(endIndexT-i) * kernelHelper2(n,s,endIndexS, t, i);
1217: if (m_useRecursionCache) {
1218: cachekh2K[adr % maxCache]=adr+1; cachekh2[adr % maxCache]=ret;
1219: }
1220: return ret;
1221: }
1222: }
1223: */
1224: //end the recursion if x is not found in t.
1225: /* double ret = getReturnValue(n);
1226: if (m_useRecursionCache) {
1227: cachekh2K[adr % maxCache]=adr+1; cachekh2[adr % maxCache]=ret;
1228: }
1229: return ret;*/
1230: }
1231:
1232: /**
1233: * the kernel function K explained in [1] using lambda pruning, explained in
1234: * [2]. An additional parameter is introduced, which denotes the maximum
1235: * length of a subsequence match. This allows for the control of how relaxed
1236: * the subsequence matches are. <br>
1237: *
1238: * @param n the current length of the matching subsequence
1239: * @param s first string, as a char array
1240: * @param t second string, as a char array
1241: * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1242: * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1243: * @param remainingMatchLength actually the initial value for
1244: * maxLambdaExponent
1245: * @return a double indicating the distance or similarity between s and t,
1246: * according to and depending on the initial value for n.
1247: */
1248: protected double kernelLP(int n, char[] s, int endIndexS, char[] t,
1249: int endIndexT, int remainingMatchLength) {
1250: //see code docs in kernel()
1251: if (Math.min(endIndexS + 1, endIndexT + 1) < n) {
1252: return getReturnValue(n);
1253: }
1254: //lambda pruning check
1255: //stops recursion if the match is so long that the resulting
1256: //power of lambda is smaller than minLambda
1257: //if lambda pruning is not used, the remainingMatchLength is < 0
1258: //and this check never stops the recursion
1259: if (remainingMatchLength == 0)
1260: return getReturnValue(n);
1261: double result = 0;
1262: //see code docs in kernel()
1263: for (int iS = endIndexS; iS > n - 2; iS--) {
1264: double buf = 0;
1265: char x = s[iS];
1266: for (int j = 0; j <= endIndexT; j++) {
1267: if (t[j] == x) {
1268: //both t[j] and x are considered part of the subsequence match, hence
1269: //subtract 2 from the remainingMatchLength
1270: buf += kernelHelperLP(n - 1, s, iS - 1, t, j - 1,
1271: remainingMatchLength - 2);
1272: }
1273: }
1274: result += buf * m_powersOflambda[2];
1275: }
1276: return result;
1277: }
1278:
1279: /**
1280: * helper function for the evaluation of the kernel (K'n) using lambda pruning
1281: *
1282: * @param n the current length of the matching subsequence
1283: * @param s first string, as a char array
1284: * @param t second string, as a char array
1285: * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1286: * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1287: * @param remainingMatchLength the number of characters that may still be
1288: * used
1289: * for matching (i.e. gaps + matches in both strings)
1290: * @return a partial result for K
1291: */
1292: protected double kernelHelperLP(int n, char[] s, int endIndexS,
1293: char[] t, int endIndexT, int remainingMatchLength) {
1294:
1295: //see code docs in kernelHelper()
1296: if (n == 0) {
1297: return getReturnValue(n);
1298:
1299: }
1300: //see code docs in kernelHelper()
1301: if (Math.min(endIndexS + 1, endIndexT + 1) < n) {
1302: ;
1303: return getReturnValue(n);
1304: }
1305:
1306: //lambda pruning check
1307: //stops recursion if the match is so long that the resulting
1308: //power of lambda is smaller than minLambda
1309: //if lambda pruning is not used, the remainingMatchLength is < 0
1310: //and this check never stops the recursion
1311: if (remainingMatchLength < 2 * n) {
1312: return getReturnValue(n);
1313: }
1314: int adr = 0;
1315: if (m_useRecursionCache) {
1316: adr = m_multX * n + m_multY * endIndexS + m_multZ
1317: * endIndexT + m_multZZ * remainingMatchLength;
1318: if (cachekh2K[adr % maxCache] == adr + 1) {
1319: return cachekh2[adr % maxCache];
1320: }
1321: }
1322:
1323: int rml = 0; //counts the remaining match length
1324: double result = 0;
1325: //see code docs in kernelHelper()
1326: //difference to implementation in kernelHelper:
1327: //*)choose different starting point, which is found counting
1328: //the maximal remaining match length from endIndexS.
1329: //*)keep track of the remaining match length, rml, which is
1330: // incremented each loop
1331: for (int iS = (endIndexS - remainingMatchLength); iS <= endIndexS; iS++) {
1332: result *= m_lambda;
1333: result += kernelHelper2LP(n, s, iS, t, endIndexT, rml++);
1334: }
1335:
1336: if (m_useRecursionCache && endIndexS >= 0 && endIndexT >= 0
1337: && n >= 0) {
1338: cachekhK[adr % maxCache] = adr + 1;
1339: cachekh[adr % maxCache] = result;
1340: }
1341: return result;
1342: }
1343:
1344: /**
1345: * helper function for the evaluation of the kernel (K''n) using lambda
1346: * pruning
1347: *
1348: * @param n the current length of the matching subsequence
1349: * @param s first string, as a char array
1350: * @param t second string, as a char array
1351: * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1352: * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1353: * @param remainingMatchLength the number of characters that may still be
1354: * used
1355: * for matching (i.e. gaps + matches in both strings)
1356: * @return a partial result for K'
1357: */
1358: protected double kernelHelper2LP(int n, char[] s, int endIndexS,
1359: char[] t, int endIndexT, int remainingMatchLength) {
1360:
1361: //lambda pruning check
1362: //stops recursion if the match is so long that the resulting
1363: //power of lambda is smaller than minLambda
1364: //if lambda pruning is not used, the remainingMatchLength is < 0
1365: //and this check never stops the recursion
1366: //if (remainingMatchLength <= 0) return 0;
1367: if (remainingMatchLength < 2 * n)
1368: return getReturnValue(n);
1369:
1370: //see code docs in kernelHelper2()
1371: if (endIndexS < 0 || endIndexT < 0)
1372: return getReturnValue(n);
1373: int adr = 0;
1374: if (m_useRecursionCache) {
1375: adr = m_multX * n + m_multY * endIndexS + m_multZ
1376: * endIndexT + m_multZZ * remainingMatchLength;
1377: if (cachekh2K[adr % maxCache] == adr + 1) {
1378: return cachekh2[adr % maxCache];
1379: }
1380: }
1381:
1382: char x = s[endIndexS];
1383: if (x == t[endIndexT]) {
1384: double ret = m_lambda
1385: * (kernelHelper2LP(n, s, endIndexS, t,
1386: endIndexT - 1, remainingMatchLength - 1) + m_lambda
1387: * kernelHelperLP(n - 1, s, endIndexS - 1,
1388: t, endIndexT - 1,
1389: remainingMatchLength - 2));
1390: if (m_useRecursionCache && endIndexS >= 0 && endIndexT >= 0
1391: && n >= 0) {
1392: cachekh2K[adr % maxCache] = adr + 1;
1393: cachekh2[adr % maxCache] = ret;
1394: }
1395: return ret;
1396: }
1397:
1398: //see code docs in kernelHelper()
1399: //differences to implementation in kernelHelper():
1400: //*) choose a different ending point for the loop
1401: // based on the remaining match length
1402: int i;
1403: int minIndex = endIndexT - remainingMatchLength;
1404: if (minIndex < 0)
1405: minIndex = 0;
1406: for (i = endIndexT; i >= minIndex; i--) {
1407: if (x == t[i]) {
1408: int skipLength = endIndexT - i;
1409: double ret = getPowerOfLambda(skipLength)
1410: * kernelHelper2LP(n, s, endIndexS, t, i,
1411: remainingMatchLength - skipLength);
1412: if (m_useRecursionCache && endIndexS >= 0
1413: && endIndexT >= 0 && n >= 0) {
1414: cachekh2K[adr % maxCache] = adr + 1;
1415: cachekh2[adr % maxCache] = ret;
1416: }
1417: return ret;
1418: }
1419: }
1420: double ret = getReturnValue(n);
1421: if (m_useRecursionCache && endIndexS >= 0 && endIndexT >= 0
1422: && n >= 0) {
1423: cachekh2K[adr % maxCache] = adr + 1;
1424: cachekh2[adr % maxCache] = ret;
1425: }
1426: return ret;
1427: }
1428:
1429: /**
1430: * precalculates small powers of lambda to speed up the kernel evaluation
1431: *
1432: * @return the powers
1433: */
1434: private double[] calculatePowersOfLambda() {
1435: double[] powers = new double[MAX_POWER_OF_LAMBDA + 1];
1436: powers[0] = 1.0;
1437: double val = 1.0;
1438: for (int i = 1; i <= MAX_POWER_OF_LAMBDA; i++) {
1439: val *= m_lambda;
1440: powers[i] = val;
1441: }
1442: return powers;
1443: }
1444:
1445: /**
1446: * retrieves a power of lambda from the lambda cache or calculates it
1447: * directly
1448: *
1449: * @param exponent the exponent to calculate
1450: * @return the exponent-th power of lambda
1451: */
1452: private double getPowerOfLambda(int exponent) {
1453: if (exponent > MAX_POWER_OF_LAMBDA)
1454: return Math.pow(m_lambda, exponent);
1455:
1456: if (exponent < 0)
1457: throw new IllegalArgumentException(
1458: "only positive powers of lambda may be computed");
1459:
1460: return m_powersOflambda[exponent];
1461: }
1462:
1463: /**
1464: * initializes variables etc.
1465: *
1466: * @param data the data to use
1467: */
1468: protected void initVars(Instances data) {
1469: super .initVars(data);
1470:
1471: m_kernelEvals = 0;
1472: // take the first string attribute
1473: m_strAttr = -1;
1474: for (int i = 0; i < data.numAttributes(); i++) {
1475: if (i == data.classIndex())
1476: continue;
1477: if (data.attribute(i).type() == Attribute.STRING) {
1478: m_strAttr = i;
1479: break;
1480: }
1481: }
1482: m_numInsts = m_data.numInstances();
1483: m_storage = new double[m_cacheSize];
1484: m_keys = new long[m_cacheSize];
1485: m_powersOflambda = calculatePowersOfLambda();
1486: }
1487:
1488: /**
1489: * Returns the Capabilities of this kernel.
1490: *
1491: * @return the capabilities of this object
1492: * @see Capabilities
1493: */
1494: public Capabilities getCapabilities() {
1495: Capabilities result = super .getCapabilities();
1496:
1497: result.enable(Capability.STRING_ATTRIBUTES);
1498: result.enableAllClasses();
1499:
1500: return result;
1501: }
1502:
1503: /**
1504: * builds the kernel with the given data.
1505: *
1506: * @param data the data to base the kernel on
1507: * @throws Exception if something goes wrong, e.g., the data does not
1508: * consist of one string attribute and the class
1509: */
1510: public void buildKernel(Instances data) throws Exception {
1511: super.buildKernel(data);
1512: }
1513: }
|