Source Code Cross Referenced for StringKernel.java in » Science » weka » weka » classifiers » functions » supportVector » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Sevlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Science » weka » weka.classifiers.functions.supportVector
Source Cross Referenced Class Diagram Java Document (Java Doc)
0001:        package weka.classifiers.functions.supportVector;
0002:
0003:        import weka.core.Attribute;
0004:        import weka.core.Capabilities;
0005:        import weka.core.Instance;
0006:        import weka.core.Instances;
0007:        import weka.core.Option;
0008:        import weka.core.SelectedTag;
0009:        import weka.core.Tag;
0010:        import weka.core.TechnicalInformation;
0011:        import weka.core.TechnicalInformationHandler;
0012:        import weka.core.Utils;
0013:        import weka.core.Capabilities.Capability;
0014:        import weka.core.TechnicalInformation.Field;
0015:        import weka.core.TechnicalInformation.Type;
0016:
0017:        import java.util.Enumeration;
0018:        import java.util.Vector;
0019:
0020:        /**
0021:         <!-- globalinfo-start -->
0022:         * Implementation of the subsequence kernel (SSK) as described in [1] and of the subsequence kernel with lambda pruning (SSK-LP) as described in [2].<br/>
0023:         * <br/>
0024:         * For more information, see<br/>
0025:         * <br/>
0026:         * Huma Lodhi, Craig Saunders, John Shawe-Taylor, Nello Cristianini, Christopher J. C. H. Watkins (2002). Text Classification using String Kernels. Journal of Machine Learning Research. 2:419-444.<br/>
0027:         * <br/>
0028:         * F. Kleedorfer, A. Seewald (2005). Implementation of a String Kernel for WEKA. Wien, Austria.
0029:         * <p/>
0030:         <!-- globalinfo-end -->
0031:         *
0032:         <!-- technical-bibtex-start -->
0033:         * BibTeX:
0034:         * <pre>
0035:         * &#64;article{Lodhi2002,
0036:         *    author = {Huma Lodhi and Craig Saunders and John Shawe-Taylor and Nello Cristianini and Christopher J. C. H. Watkins},
0037:         *    journal = {Journal of Machine Learning Research},
0038:         *    pages = {419-444},
0039:         *    title = {Text Classification using String Kernels},
0040:         *    volume = {2},
0041:         *    year = {2002},
0042:         *    HTTP = {http://www.jmlr.org/papers/v2/lodhi02a.html}
0043:         * }
0044:         * 
0045:         * &#64;techreport{Kleedorfer2005,
0046:         *    address = {Wien, Austria},
0047:         *    author = {F. Kleedorfer and A. Seewald},
0048:         *    institution = {Oesterreichisches Forschungsinstitut fuer Artificial Intelligence},
0049:         *    number = {TR-2005-13},
0050:         *    title = {Implementation of a String Kernel for WEKA},
0051:         *    year = {2005}
0052:         * }
0053:         * </pre>
0054:         * <p/>
0055:         <!-- technical-bibtex-end -->
0056:         * 
0057:         <!-- options-start -->
0058:         * Valid options are: <p/>
0059:         * 
0060:         * <pre> -D
0061:         *  Enables debugging output (if available) to be printed.
0062:         *  (default: off)</pre>
0063:         * 
0064:         * <pre> -no-checks
0065:         *  Turns off all checks - use with caution!
0066:         *  (default: checks on)</pre>
0067:         * 
0068:         * <pre> -P &lt;0|1&gt;
0069:         *  The pruning method to use:
0070:         *  0 = No pruning
0071:         *  1 = Lambda pruning
0072:         *  (default: 0)</pre>
0073:         * 
0074:         * <pre> -C &lt;num&gt;
0075:         *  The size of the cache (a prime number).
0076:         *  (default: 250007)</pre>
0077:         * 
0078:         * <pre> -IC &lt;num&gt;
0079:         *  The size of the internal cache (a prime number).
0080:         *  (default: 200003)</pre>
0081:         * 
0082:         * <pre> -L &lt;num&gt;
0083:         *  The lambda constant. Penalizes non-continuous subsequence
0084:         *  matches. Must be in (0,1).
0085:         *  (default: 0.5)</pre>
0086:         * 
0087:         * <pre> -ssl &lt;num&gt;
0088:         *  The length of the subsequence.
0089:         *  (default: 3)</pre>
0090:         * 
0091:         * <pre> -ssl-max &lt;num&gt;
0092:         *  The maximum length of the subsequence.
0093:         *  (default: 9)</pre>
0094:         * 
0095:         * <pre> -N
0096:         *  Use normalization.
0097:         *  (default: no)</pre>
0098:         * 
0099:         <!-- options-end -->
0100:         * 
0101:         * <h1>Theory</h1>
0102:         * <h2>Overview</h2>
0103:         * The algorithm computes a measure of similarity between two texts based on
0104:         * the number and form of their common subsequences, which need not be
0105:         * contiguous. This method can be parametrized by specifying the subsequence
0106:         * length k, the penalty factor lambda, which penalizes non-contiguous matches,
0107:         * and optional 'lambda pruning', which takes maxLambdaExponent,
0108:         * <code>m</code>, as parameter. Lambda pruning causes very 'stretched'
0109:         * substring matches not to be counted, thus speeding up the computation. The
0110:         * functionality of SSK and SSK-LP is explained in the following using simple
0111:         * examples.
0112:         * 
0113:         * <h2>Explanation &amp; Examples</h2>
0114:         * for all of the following examples, we assume these parameter values: 
0115:         *<pre> 
0116:         *k=2
0117:         *lambda=0.5
0118:         *m=8 (for SSK-LP examples)
0119:         *</pre>
0120:         * 
0121:         * <h3>SSK</h3>
0122:         * 
0123:         * <h4>Example 1</h4>
0124:         * 
0125:         * <pre>
0126:         *SSK(2,"ab","axb")=0.5^5 = 0,03125
0127:         *</pre>
0128:         * There is one subsequence of the length of 2 that both strings have in
0129:         * common, "ab".  The result of SSK is computed by raising lambda to the power
0130:         * of L, where L is the length of the subsequence match in the one string plus
0131:         * the length of the subsequence match in the other, in our case:
0132:         * <pre>
0133:         *&nbsp;  ab    axb
0134:         *L= 2  +   3 = 5
0135:         * </pre>
0136:         * hence, the kernel yields 0.5^5 = 0,03125
0137:         *
0138:         * <h4>Example 2</h4>
0139:         * <pre>
0140:         *SSK(2,"ab","abb")=0.5^5 + 0.5^4 = 0,09375
0141:         *</pre>
0142:         * Here, we also have one subsequence of the length of 2 that both strings have
0143:         * in common, "ab".  The result of SSK is actually computed by summing over all
0144:         * values computed for each occurrence of a common subsequence match. In this
0145:         * example, there are two possible cases:
0146:         * <pre>
0147:         *ab    abb
0148:         *--    --  L=4
0149:         *--    - - L=5
0150:         * </pre>
0151:         * we have two matches, one of the length of 2+2=4, one of the length of 2+3=5, 
0152:         * so we get the result 0.5^5 + 0.5^4 = 0,09375.
0153:         *
0154:         * <h3>SSK-LP</h3>
0155:         * Without lambda pruning, the string kernel finds *all* common subsequences of
0156:         * the given length, whereas with lambda pruning, common subsequence matches
0157:         * that are too much stretched in both strings are not taken into account. It
0158:         * is argued that the value yielded for such a common subsequence is too low
0159:         * (<code>lambda ^(length[match_in_s] + length[match_in_t]</code>) . Tests have
0160:         * shown that a tremendous speedup can be achieved using this technique while
0161:         * suffering from very little quality loss. <br>
0162:         * Lambda pruning is parametrized by the maximum lambda exponent. It is a good
0163:         * idea to choose that value to be about 3 or 4 times the subsequence length as
0164:         * a rule of thumb. YMMV.
0165:         *
0166:         * <h4>Example 3</h4>
0167:         * Without lambda pruning, one common subsequence, 
0168:         * "AB" would be found in the following two strings. (With k=2)  
0169:         * <pre>
0170:         *SSK(2,"ab","axb")=0.5^14 = 0,00006103515625
0171:         *</pre>
0172:         * lambda pruning allows for the control of the match length. So, if m 
0173:         * (the maximum lambda exponent) is e.g. 8, these two strings would 
0174:         * yield a kernel value of 0:
0175:         * <pre>
0176:         *with lambda pruning:    SSK-LP(2,8,"AxxxxxxxxxB","AyB")= 0
0177:         *without lambda pruning: SSK(2,"AxxxxxxxxxB","AyB")= 0.5^14 = 0,00006103515625  
0178:         *</pre>
0179:         * This is because the exponent for lambda (=the length of the subsequence
0180:         * match) would be 14, which is &gt; 8. In Contrast, the next result is
0181:         * &gt; 0
0182:         *<pre>
0183:         *m=8
0184:         *SSK-LP(2,8,"AxxB","AyyB")=0.5^8 = 0,00390625
0185:         *</pre>
0186:         * because the lambda exponent would be 8, which is just accepted by lambda
0187:         * pruning.
0188:         *
0189:         * <h3>Normalization</h3>
0190:         * When the string kernel is used for its main purpose, as the kernel of a
0191:         * support vector machine, it is not normalized.  The normalized kernel can be
0192:         * switched on by -F (feature space normalization) but is much slower.  Like
0193:         * most unnormalized kernels, K(x,x) is not a fixed value, see the next
0194:         * example.
0195:         *
0196:         * <h4>Example 4</h4> 
0197:         *<pre>
0198:         *SSK(2,"ab","ab")=0.5^4 = 0.0625
0199:         *SSK(2,"AxxxxxxxxxB","AxxxxxxxxxB") = 12.761724710464478
0200:         *</pre>
0201:         * SSK is evaluated twice, each time for two identical strings. A good measure
0202:         * of similarity would produce the same value in both cases, which should  
0203:         * indicate the same level of similarity. The value of the normalized SSK would
0204:         * be 1.0 in both cases. So for the purpose of computing string similarity the
0205:         * normalized kernel should be used. For SVM the unnormalized kernel is usually
0206:         * sufficient.
0207:         *
0208:         * <h2>Complexity of SSK and SSK-LP</h2>
0209:         * The time complexity of this method (without lambda pruning and with an
0210:         * infinitely large cache) is<br> 
0211:         * <pre>O(k*|s|*|t|)</pre>
0212:         * Lambda Pruning has a complexity (without caching) of<br> 
0213:         * <pre>O(m*binom(m,k)^2*(|s|+n)*|t|)</pre> <br>  
0214:         * <pre>
0215:         *k...          subsequence length (ssl)
0216:         *s,t...        strings
0217:         *|s|...        length of string s
0218:         *binom(x,y)... binomial coefficient (x!/[(x-y)!y!])
0219:         *m...          maxLambdaExponent (ssl-max)
0220:         *</pre>
0221:         * 
0222:         * Keep in mind that execution time can increase fast for long strings 
0223:         * and big values for k, especially if you don't use lambda pruning.
0224:         * With lambda pruning, computation is usually so fast that switching
0225:         * on the cache leads to slower computation because of setup costs. Therefore
0226:         * caching is switched off for lambda pruning.
0227:         * <br>
0228:         * <br>
0229:         * For details and qualitative experiments about SSK, see [1] <br>
0230:         * For details about lambda pruning and performance comparison of SSK 
0231:         * and SSK-LP (SSK with lambda pruning), see [2]   
0232:         * Note that the complexity estimation in [2] assumes no caching of
0233:         * intermediate results, which has been implemented in the meantime and
0234:         * greatly improves the speed of the SSK without lambda pruning.
0235:         *<br>
0236:         *
0237:         *<h1>Notes for usage within Weka</h1>
0238:         * Only instances of the following form can be processed using string kernels:
0239:         * <pre>
0240:         *+----------+-------------+---------------+
0241:         *|attribute#|     0       |       1       |
0242:         *+----------+-------------+---------------+
0243:         *| content  | [text data] | [class label] |
0244:         *+----------------------------------------+
0245:         * ... or ...
0246:         *+----------+---------------+-------------+
0247:         *|attribute#|     0         |     1       |
0248:         *+----------+---------------+-------------+
0249:         *| content  | [class label] | [text data] |
0250:         *+----------------------------------------+
0251:         *</pre>
0252:         *
0253:         * @author Florian Kleedorfer (kleedorfer@austria.fm)
0254:         * @author Alexander K. Seewald (alex@seewald.at)
0255:         * @version $Revision: 1.5 $
0256:         */
0257:        public class StringKernel extends Kernel implements 
0258:                TechnicalInformationHandler {
0259:
0260:            /** for serialization */
0261:            private static final long serialVersionUID = -4902954211202690123L;
0262:
0263:            /** The size of the cache (a prime number) */
0264:            private int m_cacheSize = 250007;
0265:
0266:            /** The size of the internal cache for intermediate results (a prime number) */
0267:            private int m_internalCacheSize = 200003;
0268:
0269:            /** The attribute number of the string attribute */
0270:            private int m_strAttr;
0271:
0272:            /** Kernel cache (i.e., cache for kernel evaluations) */
0273:            private double[] m_storage;
0274:            private long[] m_keys;
0275:
0276:            /** Counts the number of kernel evaluations. */
0277:            private int m_kernelEvals;
0278:
0279:            /** The number of instance in the dataset */
0280:            private int m_numInsts;
0281:
0282:            /** Pruning method: No Pruning */
0283:            public final static int PRUNING_NONE = 0;
0284:            /** Pruning method: Lambda See [2] for details. */
0285:            public final static int PRUNING_LAMBDA = 1;
0286:            /** Pruning methods */
0287:            public static final Tag[] TAGS_PRUNING = {
0288:                    new Tag(PRUNING_NONE, "No pruning"),
0289:                    new Tag(PRUNING_LAMBDA, "Lambda pruning"), };
0290:
0291:            /** the pruning method */
0292:            protected int m_PruningMethod = PRUNING_NONE;
0293:
0294:            /** the decay factor that penalizes non-continuous substring matches. See [1]
0295:             * for details. */
0296:            protected double m_lambda = 0.5;
0297:
0298:            /** The substring length */
0299:            private int m_subsequenceLength = 3;
0300:
0301:            /** The maximum substring length for lambda pruning */
0302:            private int m_maxSubsequenceLength = 9;
0303:
0304:            /** powers of lambda are prepared prior to kernel evaluations.
0305:             * all powers between 0 and this value are precalculated */
0306:            protected static final int MAX_POWER_OF_LAMBDA = 10000;
0307:
0308:            /** the precalculated powers of lambda */
0309:            protected double[] m_powersOflambda = null;
0310:
0311:            /** flag for switching normalization on or off. This defaults to false and
0312:             * can be turned on by the switch for feature space normalization in SMO
0313:             */
0314:            private boolean m_normalize = false;
0315:
0316:            /** private cache for intermediate results */
0317:            private int maxCache; // is set in unnormalizedKernel(s1,s2)
0318:            private double[] cachekh;
0319:            private int[] cachekhK;
0320:            private double[] cachekh2;
0321:            private int[] cachekh2K;
0322:            /** cached indexes for private cache */
0323:            private int m_multX;
0324:            private int m_multY;
0325:            private int m_multZ;
0326:            private int m_multZZ;
0327:
0328:            private boolean m_useRecursionCache = true;
0329:
0330:            /**
0331:             * default constructor
0332:             */
0333:            public StringKernel() {
0334:                super ();
0335:            }
0336:
0337:            /**
0338:             * creates a new StringKernel object. Initializes the kernel cache and the
0339:             * 'lambda cache', i.e. the precalculated powers of lambda from lambda^2 to
0340:             * lambda^MAX_POWER_OF_LAMBDA
0341:             *
0342:             * @param data		the dataset to use
0343:             * @param cacheSize		the size of the cache
0344:             * @param subsequenceLength	the subsequence length
0345:             * @param lambda		the lambda value
0346:             * @param debug		whether to output debug information
0347:             * @throws Exception		if something goes wrong
0348:             */
0349:            public StringKernel(Instances data, int cacheSize,
0350:                    int subsequenceLength, double lambda, boolean debug)
0351:                    throws Exception {
0352:
0353:                setDebug(debug);
0354:                setCacheSize(cacheSize);
0355:                setInternalCacheSize(200003);
0356:                setSubsequenceLength(subsequenceLength);
0357:                setMaxSubsequenceLength(-1);
0358:                setLambda(lambda);
0359:
0360:                buildKernel(data);
0361:            }
0362:
0363:            /**
0364:             * Returns a string describing the kernel
0365:             * 
0366:             * @return a description suitable for displaying in the
0367:             *         explorer/experimenter gui
0368:             */
0369:            public String globalInfo() {
0370:                return "Implementation of the subsequence kernel (SSK) as described in [1] "
0371:                        + "and of the subsequence kernel with lambda pruning (SSK-LP) as "
0372:                        + "described in [2].\n\n"
0373:                        + "For more information, see\n\n"
0374:                        + getTechnicalInformation().toString();
0375:            }
0376:
0377:            /**
0378:             * Returns an instance of a TechnicalInformation object, containing 
0379:             * detailed information about the technical background of this class,
0380:             * e.g., paper reference or book this class is based on.
0381:             * 
0382:             * @return the technical information about this class
0383:             */
0384:            public TechnicalInformation getTechnicalInformation() {
0385:                TechnicalInformation result;
0386:                TechnicalInformation additional;
0387:
0388:                result = new TechnicalInformation(Type.ARTICLE);
0389:                result
0390:                        .setValue(
0391:                                Field.AUTHOR,
0392:                                "Huma Lodhi and Craig Saunders and John Shawe-Taylor and Nello Cristianini and Christopher J. C. H. Watkins");
0393:                result.setValue(Field.YEAR, "2002");
0394:                result.setValue(Field.TITLE,
0395:                        "Text Classification using String Kernels");
0396:                result.setValue(Field.JOURNAL,
0397:                        "Journal of Machine Learning Research");
0398:                result.setValue(Field.VOLUME, "2");
0399:                result.setValue(Field.PAGES, "419-444");
0400:                result.setValue(Field.HTTP,
0401:                        "http://www.jmlr.org/papers/v2/lodhi02a.html");
0402:
0403:                additional = result.add(Type.TECHREPORT);
0404:                additional.setValue(Field.AUTHOR,
0405:                        "F. Kleedorfer and A. Seewald");
0406:                additional.setValue(Field.YEAR, "2005");
0407:                additional.setValue(Field.TITLE,
0408:                        "Implementation of a String Kernel for WEKA");
0409:                additional
0410:                        .setValue(Field.INSTITUTION,
0411:                                "Oesterreichisches Forschungsinstitut fuer Artificial Intelligence");
0412:                additional.setValue(Field.ADDRESS, "Wien, Austria");
0413:                additional.setValue(Field.NUMBER, "TR-2005-13");
0414:
0415:                return result;
0416:            }
0417:
0418:            /**
0419:             * Returns an enumeration describing the available options.
0420:             *
0421:             * @return 		an enumeration of all the available options.
0422:             */
0423:            public Enumeration listOptions() {
0424:                Vector result;
0425:                Enumeration en;
0426:                String desc;
0427:                String param;
0428:                int i;
0429:                SelectedTag tag;
0430:
0431:                result = new Vector();
0432:
0433:                en = super .listOptions();
0434:                while (en.hasMoreElements())
0435:                    result.addElement(en.nextElement());
0436:
0437:                desc = "";
0438:                param = "";
0439:                for (i = 0; i < TAGS_PRUNING.length; i++) {
0440:                    if (i > 0)
0441:                        param += "|";
0442:                    tag = new SelectedTag(TAGS_PRUNING[i].getID(), TAGS_PRUNING);
0443:                    param += "" + tag.getSelectedTag().getID();
0444:                    desc += "\t" + tag.getSelectedTag().getID() + " = "
0445:                            + tag.getSelectedTag().getReadable() + "\n";
0446:                }
0447:
0448:                result.addElement(new Option("\tThe pruning method to use:\n"
0449:                        + desc + "\t(default: " + PRUNING_NONE + ")", "P", 1,
0450:                        "-P <" + param + ">"));
0451:
0452:                result.addElement(new Option(
0453:                        "\tThe size of the cache (a prime number).\n"
0454:                                + "\t(default: 250007)", "C", 1, "-C <num>"));
0455:
0456:                result.addElement(new Option(
0457:                        "\tThe size of the internal cache (a prime number).\n"
0458:                                + "\t(default: 200003)", "IC", 1, "-IC <num>"));
0459:
0460:                result.addElement(new Option(
0461:                        "\tThe lambda constant. Penalizes non-continuous subsequence\n"
0462:                                + "\tmatches. Must be in (0,1).\n"
0463:                                + "\t(default: 0.5)", "L", 1, "-L <num>"));
0464:
0465:                result
0466:                        .addElement(new Option(
0467:                                "\tThe length of the subsequence.\n"
0468:                                        + "\t(default: 3)", "ssl", 1,
0469:                                "-ssl <num>"));
0470:
0471:                result.addElement(new Option(
0472:                        "\tThe maximum length of the subsequence.\n"
0473:                                + "\t(default: 9)", "ssl-max", 1,
0474:                        "-ssl-max <num>"));
0475:
0476:                result.addElement(new Option("\tUse normalization.\n"
0477:                        + "\t(default: no)", "N", 0, "-N"));
0478:
0479:                return result.elements();
0480:            }
0481:
0482:            /**
0483:             * Parses a given list of options. <p/>
0484:             * 
0485:             <!-- options-start -->
0486:             * Valid options are: <p/>
0487:             * 
0488:             * <pre> -D
0489:             *  Enables debugging output (if available) to be printed.
0490:             *  (default: off)</pre>
0491:             * 
0492:             * <pre> -no-checks
0493:             *  Turns off all checks - use with caution!
0494:             *  (default: checks on)</pre>
0495:             * 
0496:             * <pre> -P &lt;0|1&gt;
0497:             *  The pruning method to use:
0498:             *  0 = No pruning
0499:             *  1 = Lambda pruning
0500:             *  (default: 0)</pre>
0501:             * 
0502:             * <pre> -C &lt;num&gt;
0503:             *  The size of the cache (a prime number).
0504:             *  (default: 250007)</pre>
0505:             * 
0506:             * <pre> -IC &lt;num&gt;
0507:             *  The size of the internal cache (a prime number).
0508:             *  (default: 200003)</pre>
0509:             * 
0510:             * <pre> -L &lt;num&gt;
0511:             *  The lambda constant. Penalizes non-continuous subsequence
0512:             *  matches. Must be in (0,1).
0513:             *  (default: 0.5)</pre>
0514:             * 
0515:             * <pre> -ssl &lt;num&gt;
0516:             *  The length of the subsequence.
0517:             *  (default: 3)</pre>
0518:             * 
0519:             * <pre> -ssl-max &lt;num&gt;
0520:             *  The maximum length of the subsequence.
0521:             *  (default: 9)</pre>
0522:             * 
0523:             * <pre> -N
0524:             *  Use normalization.
0525:             *  (default: no)</pre>
0526:             * 
0527:             <!-- options-end -->
0528:             * 
0529:             * @param options 	the list of options as an array of strings
0530:             * @throws Exception 	if an option is not supported
0531:             */
0532:            public void setOptions(String[] options) throws Exception {
0533:                String tmpStr;
0534:
0535:                tmpStr = Utils.getOption('P', options);
0536:                if (tmpStr.length() != 0)
0537:                    setPruningMethod(new SelectedTag(Integer.parseInt(tmpStr),
0538:                            TAGS_PRUNING));
0539:                else
0540:                    setPruningMethod(new SelectedTag(PRUNING_NONE, TAGS_PRUNING));
0541:
0542:                tmpStr = Utils.getOption('C', options);
0543:                if (tmpStr.length() != 0)
0544:                    setCacheSize(Integer.parseInt(tmpStr));
0545:                else
0546:                    setCacheSize(250007);
0547:
0548:                tmpStr = Utils.getOption("IC", options);
0549:                if (tmpStr.length() != 0)
0550:                    setInternalCacheSize(Integer.parseInt(tmpStr));
0551:                else
0552:                    setInternalCacheSize(200003);
0553:
0554:                tmpStr = Utils.getOption('L', options);
0555:                if (tmpStr.length() != 0)
0556:                    setLambda(Double.parseDouble(tmpStr));
0557:                else
0558:                    setLambda(0.5);
0559:
0560:                tmpStr = Utils.getOption("ssl", options);
0561:                if (tmpStr.length() != 0)
0562:                    setSubsequenceLength(Integer.parseInt(tmpStr));
0563:                else
0564:                    setSubsequenceLength(3);
0565:
0566:                tmpStr = Utils.getOption("ssl-max", options);
0567:                if (tmpStr.length() != 0)
0568:                    setMaxSubsequenceLength(Integer.parseInt(tmpStr));
0569:                else
0570:                    setMaxSubsequenceLength(9);
0571:
0572:                setUseNormalization(Utils.getFlag('N', options));
0573:
0574:                if (getMaxSubsequenceLength() < 2 * getSubsequenceLength()) {
0575:                    throw new IllegalArgumentException(
0576:                            "Lambda Pruning forbids even contiguous substring matches! "
0577:                                    + "Use a bigger value for ssl-max (at least 2*ssl).");
0578:                }
0579:
0580:                super .setOptions(options);
0581:            }
0582:
0583:            /**
0584:             * Gets the current settings of the Kernel.
0585:             *
0586:             * @return an array of strings suitable for passing to setOptions
0587:             */
0588:            public String[] getOptions() {
0589:                int i;
0590:                Vector result;
0591:                String[] options;
0592:
0593:                result = new Vector();
0594:                options = super .getOptions();
0595:                for (i = 0; i < options.length; i++)
0596:                    result.add(options[i]);
0597:
0598:                result.add("-P");
0599:                result.add("" + m_PruningMethod);
0600:
0601:                result.add("-C");
0602:                result.add("" + getCacheSize());
0603:
0604:                result.add("-IC");
0605:                result.add("" + getInternalCacheSize());
0606:
0607:                result.add("-L");
0608:                result.add("" + getLambda());
0609:
0610:                result.add("-ssl");
0611:                result.add("" + getSubsequenceLength());
0612:
0613:                result.add("-ssl-max");
0614:                result.add("" + getMaxSubsequenceLength());
0615:
0616:                if (getUseNormalization())
0617:                    result.add("-L");
0618:
0619:                return (String[]) result.toArray(new String[result.size()]);
0620:            }
0621:
0622:            /**
0623:             * Returns the tip text for this property
0624:             * 
0625:             * @return 		tip text for this property suitable for
0626:             * 			displaying in the explorer/experimenter gui
0627:             */
0628:            public String pruningMethodTipText() {
0629:                return "The pruning method.";
0630:            }
0631:
0632:            /**
0633:             * Sets the method used to for pruning. 
0634:             *
0635:             * @param value 	the pruning method to use.
0636:             */
0637:            public void setPruningMethod(SelectedTag value) {
0638:                if (value.getTags() == TAGS_PRUNING)
0639:                    m_PruningMethod = value.getSelectedTag().getID();
0640:            }
0641:
0642:            /**
0643:             * Gets the method used for pruning. 
0644:             *
0645:             * @return 		the pruning method to use.
0646:             */
0647:            public SelectedTag getPruningMethod() {
0648:                return new SelectedTag(m_PruningMethod, TAGS_PRUNING);
0649:            }
0650:
0651:            /**
0652:             * Sets the size of the cache to use (a prime number)
0653:             * 
0654:             * @param value	the size of the cache
0655:             */
0656:            public void setCacheSize(int value) {
0657:                if (value >= 0) {
0658:                    m_cacheSize = value;
0659:                    clean();
0660:                } else {
0661:                    System.out
0662:                            .println("Cache size cannot be smaller than 0 (provided: "
0663:                                    + value + ")!");
0664:                }
0665:            }
0666:
0667:            /**
0668:             * Gets the size of the cache
0669:             * 
0670:             * @return 		the cache size
0671:             */
0672:            public int getCacheSize() {
0673:                return m_cacheSize;
0674:            }
0675:
0676:            /**
0677:             * Returns the tip text for this property
0678:             * 
0679:             * @return 		tip text for this property suitable for
0680:             * 			displaying in the explorer/experimenter gui
0681:             */
0682:            public String cacheSizeTipText() {
0683:                return "The size of the cache (a prime number).";
0684:            }
0685:
0686:            /** 
0687:             * sets the size of the internal cache for intermediate results. Memory
0688:             * consumption is about 16x this amount in bytes. Only use when lambda
0689:             * pruning is switched off.
0690:             *
0691:             * @param value	the size of the internal cache
0692:             */
0693:            public void setInternalCacheSize(int value) {
0694:                if (value >= 0) {
0695:                    m_internalCacheSize = value;
0696:                    clean();
0697:                } else {
0698:                    System.out
0699:                            .println("Cache size cannot be smaller than 0 (provided: "
0700:                                    + value + ")!");
0701:                }
0702:            }
0703:
0704:            /**
0705:             * Gets the size of the internal cache
0706:             * 
0707:             * @return 		the cache size
0708:             */
0709:            public int getInternalCacheSize() {
0710:                return m_internalCacheSize;
0711:            }
0712:
0713:            /**
0714:             * Returns the tip text for this property
0715:             * 
0716:             * @return 		tip text for this property suitable for
0717:             * 			displaying in the explorer/experimenter gui
0718:             */
0719:            public String internalCacheSizeTipText() {
0720:                return "The size of the internal cache (a prime number).";
0721:            }
0722:
0723:            /**
0724:             * Sets the length of the subsequence.
0725:             * 
0726:             * @param value	the length
0727:             */
0728:            public void setSubsequenceLength(int value) {
0729:                m_subsequenceLength = value;
0730:            }
0731:
0732:            /**
0733:             * Returns the length of the subsequence
0734:             * 
0735:             * @return		the length
0736:             */
0737:            public int getSubsequenceLength() {
0738:                return m_subsequenceLength;
0739:            }
0740:
0741:            /**
0742:             * Returns the tip text for this property
0743:             * 
0744:             * @return 		tip text for this property suitable for
0745:             * 			displaying in the explorer/experimenter gui
0746:             */
0747:            public String subsequenceLengthTipText() {
0748:                return "The subsequence length.";
0749:            }
0750:
0751:            /**
0752:             * Sets the maximum length of the subsequence.
0753:             * 
0754:             * @param value	the maximum length
0755:             */
0756:            public void setMaxSubsequenceLength(int value) {
0757:                m_maxSubsequenceLength = value;
0758:            }
0759:
0760:            /**
0761:             * Returns the maximum length of the subsequence
0762:             * 
0763:             * @return		the maximum length
0764:             */
0765:            public int getMaxSubsequenceLength() {
0766:                return m_maxSubsequenceLength;
0767:            }
0768:
0769:            /**
0770:             * Returns the tip text for this property
0771:             * 
0772:             * @return 		tip text for this property suitable for
0773:             * 			displaying in the explorer/experimenter gui
0774:             */
0775:            public String maxSubsequenceLengthTipText() {
0776:                return "The maximum subsequence length (theta in the paper)";
0777:            }
0778:
0779:            /**
0780:             * Sets the lambda constant used in the string kernel
0781:             * 
0782:             * @param value	the lambda value to use
0783:             */
0784:            public void setLambda(double value) {
0785:                m_lambda = value;
0786:            }
0787:
0788:            /**
0789:             * Gets the lambda constant used in the string kernel
0790:             * 
0791:             * @return		the current lambda constant
0792:             */
0793:            public double getLambda() {
0794:                return m_lambda;
0795:            }
0796:
0797:            /**
0798:             * Returns the tip text for this property
0799:             * 
0800:             * @return 		tip text for this property suitable for
0801:             * 			displaying in the explorer/experimenter gui
0802:             */
0803:            public String lambdaTipText() {
0804:                return "Penalizes non-continuous subsequence matches, from (0,1)";
0805:            }
0806:
0807:            /**
0808:             * Sets whether to use normalization.
0809:             * Each time this value is changed, the kernel cache is cleared.
0810:             *
0811:             * @param value	whether to use normalization
0812:             */
0813:            public void setUseNormalization(boolean value) {
0814:                if (value != m_normalize)
0815:                    clean();
0816:
0817:                m_normalize = value;
0818:            }
0819:
0820:            /**
0821:             * Returns whether normalization is used.
0822:             * 
0823:             * @return		true if normalization is used
0824:             */
0825:            public boolean getUseNormalization() {
0826:                return m_normalize;
0827:            }
0828:
0829:            /**
0830:             * Returns the tip text for this property
0831:             * 
0832:             * @return 		tip text for this property suitable for
0833:             * 			displaying in the explorer/experimenter gui
0834:             */
0835:            public String useNormalizationTipText() {
0836:                return "Whether to use normalization.";
0837:            }
0838:
0839:            /**
0840:             * Computes the result of the kernel function for two instances.
0841:             * If id1 == -1, eval use inst1 instead of an instance in the dataset.
0842:             *
0843:             * @param id1 the index of the first instance in the dataset
0844:             * @param id2 the index of the second instance in the dataset
0845:             * @param inst1 the instance corresponding to id1 (used if id1 == -1)
0846:             * @return the result of the kernel function
0847:             * @throws Exception if something goes wrong
0848:             */
0849:            public double eval(int id1, int id2, Instance inst1)
0850:                    throws Exception {
0851:                if (m_Debug && id1 > -1 && id2 > -1) {
0852:                    System.err.println("\nEvaluation of string kernel for");
0853:                    System.err.println(m_data.instance(id1).stringValue(
0854:                            m_strAttr));
0855:                    System.err.println("and");
0856:                    System.err.println(m_data.instance(id2).stringValue(
0857:                            m_strAttr));
0858:                }
0859:
0860:                //the normalized kernel returns 1 for comparison of 
0861:                //two identical strings
0862:                if (id1 == id2 && m_normalize)
0863:                    return 1.0;
0864:
0865:                double result = 0;
0866:                long key = -1;
0867:                int location = -1;
0868:
0869:                // we can only cache if we know the indexes
0870:                if ((id1 >= 0) && (m_keys != null)) {
0871:                    if (id1 > id2) {
0872:                        key = (long) id1 * m_numInsts + id2;
0873:                    } else {
0874:                        key = (long) id2 * m_numInsts + id1;
0875:                    }
0876:                    if (key < 0) {
0877:                        throw new Exception("Cache overflow detected!");
0878:                    }
0879:                    location = (int) (key % m_keys.length);
0880:                    if (m_keys[location] == (key + 1)) {
0881:                        if (m_Debug)
0882:                            System.err.println("result (cached): "
0883:                                    + m_storage[location]);
0884:                        return m_storage[location];
0885:                    }
0886:                }
0887:
0888:                m_kernelEvals++;
0889:                long start = System.currentTimeMillis();
0890:
0891:                Instance inst2 = m_data.instance(id2);
0892:                char[] s1 = inst1.stringValue(m_strAttr).toCharArray();
0893:                char[] s2 = inst2.stringValue(m_strAttr).toCharArray();
0894:
0895:                // prevent the kernel from returning NaN
0896:                if (s1.length == 0 || s2.length == 0)
0897:                    return 0;
0898:
0899:                if (m_normalize) {
0900:                    result = normalizedKernel(s1, s2);
0901:                } else {
0902:                    result = unnormalizedKernel(s1, s2);
0903:                }
0904:
0905:                if (m_Debug) {
0906:                    long duration = System.currentTimeMillis() - start;
0907:                    System.err.println("result: " + result);
0908:                    System.err.println("evaluation time:" + duration + "\n");
0909:                }
0910:
0911:                // store result in cache
0912:                if (key != -1) {
0913:                    m_storage[location] = result;
0914:                    m_keys[location] = (key + 1);
0915:                }
0916:                return result;
0917:            }
0918:
0919:            /**
0920:             * Frees the memory used by the kernel.
0921:             * (Useful with kernels which use cache.)
0922:             * This function is called when the training is done.
0923:             * i.e. after that, eval will be called with id1 == -1.
0924:             */
0925:            public void clean() {
0926:                m_storage = null;
0927:                m_keys = null;
0928:            }
0929:
0930:            /**
0931:             * Returns the number of kernel evaluation performed.
0932:             *
0933:             * @return the number of kernel evaluation performed.
0934:             */
0935:            public int numEvals() {
0936:                return m_kernelEvals;
0937:            }
0938:
0939:            /**
0940:             * Returns the number of dot product cache hits.
0941:             *
0942:             * @return the number of dot product cache hits, or -1 if not supported by
0943:             * this kernel.
0944:             */
0945:            public int numCacheHits() {
0946:                // TODO: implement!
0947:                return -1;
0948:            }
0949:
0950:            /**
0951:             * evaluates the normalized kernel between s and t. See [1] for details about
0952:             * the normalized SSK. 
0953:             *
0954:             * @param s first input string
0955:             * @param t second input string
0956:             * @return a double indicating their distance, or similarity
0957:             */
0958:            public double normalizedKernel(char[] s, char[] t) {
0959:                double k1 = unnormalizedKernel(s, s);
0960:                double k2 = unnormalizedKernel(t, t);
0961:                double normTerm = Math.sqrt(k1 * k2);
0962:                return unnormalizedKernel(s, t) / normTerm;
0963:            }
0964:
0965:            /**
0966:             * evaluates the unnormalized kernel between s and t. See [1] for details
0967:             * about the unnormalized SSK.
0968:             *
0969:             * @param s first input string
0970:             * @param t second input string
0971:             * @return a double indicating their distance, or similarity
0972:             */
0973:            public double unnormalizedKernel(char[] s, char[] t) {
0974:                if (t.length > s.length) {
0975:                    //swap because the algorithm is faster if s is
0976:                    //the longer string
0977:                    char[] buf = s;
0978:                    s = t;
0979:                    t = buf;
0980:                }
0981:                if (m_PruningMethod == PRUNING_NONE) {
0982:                    m_multX = (s.length + 1) * (t.length + 1);
0983:                    m_multY = (t.length + 1);
0984:                    m_multZ = 1;
0985:                    maxCache = m_internalCacheSize;
0986:                    if (maxCache == 0) {
0987:                        maxCache = (m_subsequenceLength + 1) * m_multX;
0988:                    } else if ((m_subsequenceLength + 1) * m_multX < maxCache) {
0989:                        maxCache = (m_subsequenceLength + 1) * m_multX;
0990:                    }
0991:                    m_useRecursionCache = true;
0992:                    cachekhK = new int[maxCache];
0993:                    cachekh2K = new int[maxCache];
0994:                    cachekh = new double[maxCache];
0995:                    cachekh2 = new double[maxCache];
0996:                } else if (m_PruningMethod == PRUNING_LAMBDA) {
0997:                    maxCache = 0;
0998:                    m_useRecursionCache = false;
0999:                }
1000:
1001:                double res;
1002:                if (m_PruningMethod == PRUNING_LAMBDA) {
1003:                    res = kernelLP(m_subsequenceLength, s, s.length - 1, t,
1004:                            t.length - 1, m_maxSubsequenceLength);
1005:                } else {
1006:                    res = kernel(m_subsequenceLength, s, s.length - 1, t,
1007:                            t.length - 1);
1008:                }
1009:                cachekh = null;
1010:                cachekhK = null;
1011:                cachekh2 = null;
1012:                cachekh2K = null;
1013:
1014:                return res;
1015:            }
1016:
1017:            /**
1018:             * Recursion-ending function that is called at the end of each 
1019:             * recursion branch.
1020:             * 
1021:             * @param n
1022:             * @return
1023:             */
1024:            protected double getReturnValue(int n) {
1025:                if (n == 0)
1026:                    return 1;
1027:                else
1028:                    return 0;
1029:            }
1030:
1031:            /**
1032:             * the kernel function (Kn). This function performs the outer loop
1033:             * character-wise over the first input string s. For each character
1034:             * encountered, a recursion branch is started that identifies all
1035:             * subsequences in t starting with that character. <br> See [1] for details
1036:             * but note that this code is optimized and may be hard to recognize.
1037:             * 
1038:             * @param n the current length of the matching subsequence
1039:             * @param s first string, as a char array
1040:             * @param t second string, as a char array
1041:             * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1042:             * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1043:             * @return a double indicating the distance or similarity between s and t, 
1044:             * according to and depending on the initial value for n.
1045:             */
1046:            protected double kernel(int n, char[] s, int endIndexS, char[] t,
1047:                    int endIndexT) {
1048:
1049:                //normal recursion ending case
1050:                if (Math.min(endIndexS + 1, endIndexT + 1) < n)
1051:                    return getReturnValue(n);
1052:
1053:                //accumulate all recursion results in one:
1054:                double result = 0;
1055:
1056:                //the tail-recursive function defined in [1] is turned into a
1057:                //loop here, preventing stack overflows.
1058:                //skim s from back to front
1059:                for (int iS = endIndexS; iS > n - 2; iS--) {
1060:                    double buf = 0;
1061:                    //let the current character in s be x 
1062:                    char x = s[iS];
1063:                    // iterate over all occurrences of x in t
1064:                    for (int j = 0; j <= endIndexT; j++) {
1065:                        if (t[j] == x) {
1066:                            //this is a match for the current character, hence
1067:                            //1. use previous chars in both strings (iS-1, j-1)
1068:                            //2. decrement the remainingMatchLength (n-1)
1069:                            //and start a recursion branch for these parameters
1070:                            buf += kernelHelper(n - 1, s, iS - 1, t, j - 1);
1071:                        }
1072:                    }
1073:                    //ok, all occurrences of x in t have been found
1074:                    //multiply the result with lambda^2
1075:                    //  (one lambda for x, and the other for all matches of x in t)
1076:                    result += buf * m_powersOflambda[2];
1077:                }
1078:                return result;
1079:            }
1080:
1081:            /**
1082:             * The kernel helper function, called K' in [1] and [2].
1083:             *
1084:             * @param n the current length of the matching subsequence
1085:             * @param s first string, as a char array
1086:             * @param t second string, as a char array
1087:             * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1088:             * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1089:             * @return a partial result for K
1090:             */
1091:            protected double kernelHelper(int n, char[] s, int endIndexS,
1092:                    char[] t, int endIndexT) {
1093:
1094:                //recursion ends if the current subsequence has maximal length, 
1095:                //which is the case here
1096:                if (n <= 0) {
1097:                    return getReturnValue(n);
1098:                }
1099:
1100:                //recursion ends, too, if the current subsequence is shorter than
1101:                //maximal length, but there is no chance that it will reach maximal length.
1102:                //in this case, normally 0 is returned, but the EXPERIMENTAL 
1103:                //minSubsequenceLength feature allows shorter subsequence matches
1104:                //also to contribute
1105:                if (Math.min(endIndexS + 1, endIndexT + 1) < n) {
1106:                    return getReturnValue(n);
1107:                }
1108:                int adr = 0;
1109:                if (m_useRecursionCache) {
1110:                    adr = m_multX * n + m_multY * endIndexS + m_multZ
1111:                            * endIndexT;
1112:                    if (cachekhK[adr % maxCache] == adr + 1)
1113:                        return cachekh[adr % maxCache];
1114:                }
1115:
1116:                //the tail-recursive function defined in [1] is turned into a
1117:                //loop here, preventing stack overflows.
1118:                //loop over s, nearly from the start (skip the first n-1 characters)
1119:                //and only up until endIndexS, and recursively apply K''. Thus, every
1120:                //character between n-1 and endIndexS in s is counted once as 
1121:                //being part of the subsequence match and once just as a gap. 
1122:                //In both cases lambda is multiplied with the result.
1123:                double result = 0;
1124:                /*
1125:                   for (int iS = n-1; iS <= endIndexS;iS++) {
1126:                   result *= m_lambda;
1127:                   result += kernelHelper2(n,s,iS, t, endIndexT);
1128:                   }
1129:                   if (m_useRecursionCache) {
1130:                   cachekhK[adr % maxCache]=adr+1; cachekh[adr % maxCache]=result;
1131:                   }
1132:                   return result;
1133:                 */
1134:                /* ^^^ again, above code segment does not store some intermediate results... */
1135:                result = m_lambda
1136:                        * kernelHelper(n, s, endIndexS - 1, t, endIndexT)
1137:                        + kernelHelper2(n, s, endIndexS, t, endIndexT);
1138:                if (m_useRecursionCache) {
1139:                    cachekhK[adr % maxCache] = adr + 1;
1140:                    cachekh[adr % maxCache] = result;
1141:                }
1142:                return result;
1143:            }
1144:
1145:            /** 
1146:             * helper function for the evaluation of the kernel K'' see section
1147:             * 'Efficient Computation of SSK' in [1]
1148:             *
1149:             * @param n the current length of the matching subsequence
1150:             * @param s first string, as a char array
1151:             * @param t second string, as a char array
1152:             * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1153:             * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1154:             * @return a partial result for K'
1155:             */
1156:            protected double kernelHelper2(int n, char[] s, int endIndexS,
1157:                    char[] t, int endIndexT) {
1158:
1159:                //recursion ends if one of the indices in both strings is <0
1160:                if (endIndexS < 0 || endIndexT < 0) {
1161:                    return getReturnValue(n);
1162:                }
1163:
1164:                int adr = 0;
1165:                if (m_useRecursionCache) {
1166:                    adr = m_multX * n + m_multY * endIndexS + m_multZ
1167:                            * endIndexT;
1168:                    if (cachekh2K[adr % maxCache] == adr + 1)
1169:                        return cachekh2[adr % maxCache];
1170:                }
1171:
1172:                //spot the last character in s, we'll need it
1173:                char x = s[endIndexS];
1174:
1175:                //recurse if the last characters of s and t, x (and y) are identical.
1176:                //which is an easy case: just add up two recursions, 
1177:                // 1. one that counts x and y as a part of the subsequence match
1178:                //	 -> n, endIndexS and endIndexT are decremented for next recursion level
1179:                // 	 -> lambda^2 is multiplied with the result to account for the length
1180:                //      of 2 that has been added to the length of the subsequence match
1181:                //      by accepting x and y.
1182:                // 2. one that counts y as a gap in the match 
1183:                //   -> only endIndexT is decremented for next recursion level
1184:                // 	 -> lambda is multiplied with the result to account for the length
1185:                //      of 1 that has been added to the length of the subsequence match
1186:                //		by omitting y.
1187:                if (x == t[endIndexT]) {
1188:                    double ret = m_lambda
1189:                            * (kernelHelper2(n, s, endIndexS, t, endIndexT - 1) + m_lambda
1190:                                    * kernelHelper(n - 1, s, endIndexS - 1, t,
1191:                                            endIndexT - 1));
1192:                    if (m_useRecursionCache) {
1193:                        cachekh2K[adr % maxCache] = adr + 1;
1194:                        cachekh2[adr % maxCache] = ret;
1195:                    }
1196:                    return ret;
1197:                } else {
1198:                    double ret = m_lambda
1199:                            * kernelHelper2(n, s, endIndexS, t, endIndexT - 1);
1200:                    if (m_useRecursionCache) {
1201:                        cachekh2K[adr % maxCache] = adr + 1;
1202:                        cachekh2[adr % maxCache] = ret;
1203:                    }
1204:                    return ret;
1205:                }
1206:
1207:                //look for x in t from back to front. 
1208:                //this is actually an optimization from [1] that spares unneccessary
1209:                //recursions iff
1210:                //x is actually found in t, but not at the last position.
1211:                /*
1212:                   int i;
1213:                   int threshold = n>0?n-1:0;
1214:                   for (i=endIndexT-1; i >= threshold;i--) {
1215:                   if (x == t[i]) {
1216:                   double ret=getPowerOfLambda(endIndexT-i) * kernelHelper2(n,s,endIndexS, t, i);
1217:                   if (m_useRecursionCache) {
1218:                   cachekh2K[adr % maxCache]=adr+1; cachekh2[adr % maxCache]=ret;
1219:                   }
1220:                   return ret;
1221:                   }
1222:                   }
1223:                 */
1224:                //end the recursion if x is not found in t.
1225:                /*        double ret = getReturnValue(n);
1226:                          if (m_useRecursionCache) {
1227:                          cachekh2K[adr % maxCache]=adr+1; cachekh2[adr % maxCache]=ret;
1228:                          }
1229:                          return ret;*/
1230:            }
1231:
1232:            /**
1233:             * the kernel function K explained in [1] using lambda pruning, explained in
1234:             * [2].  An additional parameter is introduced, which denotes the maximum
1235:             * length of a subsequence match. This allows for the control of how relaxed
1236:             * the subsequence matches are. <br>
1237:             *
1238:             * @param n the current length of the matching subsequence
1239:             * @param s first string, as a char array
1240:             * @param t second string, as a char array
1241:             * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1242:             * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1243:             * @param remainingMatchLength actually the initial value for
1244:             * maxLambdaExponent
1245:             * @return a double indicating the distance or similarity between s and t, 
1246:             * according to and depending on the initial value for n. 
1247:             */
1248:            protected double kernelLP(int n, char[] s, int endIndexS, char[] t,
1249:                    int endIndexT, int remainingMatchLength) {
1250:                //see code docs in kernel()
1251:                if (Math.min(endIndexS + 1, endIndexT + 1) < n) {
1252:                    return getReturnValue(n);
1253:                }
1254:                //lambda pruning check 
1255:                //stops recursion if the match is so long that the resulting
1256:                //power of lambda is smaller than minLambda
1257:                //if lambda pruning is not used, the remainingMatchLength is < 0
1258:                //and this check never stops the recursion
1259:                if (remainingMatchLength == 0)
1260:                    return getReturnValue(n);
1261:                double result = 0;
1262:                //see code docs in kernel()
1263:                for (int iS = endIndexS; iS > n - 2; iS--) {
1264:                    double buf = 0;
1265:                    char x = s[iS];
1266:                    for (int j = 0; j <= endIndexT; j++) {
1267:                        if (t[j] == x) {
1268:                            //both t[j] and x are considered part of the subsequence match, hence
1269:                            //subtract 2 from the remainingMatchLength
1270:                            buf += kernelHelperLP(n - 1, s, iS - 1, t, j - 1,
1271:                                    remainingMatchLength - 2);
1272:                        }
1273:                    }
1274:                    result += buf * m_powersOflambda[2];
1275:                }
1276:                return result;
1277:            }
1278:
1279:            /**
1280:             * helper function for the evaluation of the kernel (K'n) using lambda pruning
1281:             *
1282:             * @param n the current length of the matching subsequence
1283:             * @param s first string, as a char array
1284:             * @param t second string, as a char array
1285:             * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1286:             * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1287:             * @param remainingMatchLength the number of characters that may still be
1288:             * used 
1289:             * for matching (i.e. gaps + matches in both strings)
1290:             * @return a partial result for K 
1291:             */
1292:            protected double kernelHelperLP(int n, char[] s, int endIndexS,
1293:                    char[] t, int endIndexT, int remainingMatchLength) {
1294:
1295:                //see code docs in kernelHelper()
1296:                if (n == 0) {
1297:                    return getReturnValue(n);
1298:
1299:                }
1300:                //see code docs in kernelHelper()
1301:                if (Math.min(endIndexS + 1, endIndexT + 1) < n) {
1302:                    ;
1303:                    return getReturnValue(n);
1304:                }
1305:
1306:                //lambda pruning check
1307:                //stops recursion if the match is so long that the resulting
1308:                //power of lambda is smaller than minLambda
1309:                //if lambda pruning is not used, the remainingMatchLength is < 0
1310:                //and this check never stops the recursion
1311:                if (remainingMatchLength < 2 * n) {
1312:                    return getReturnValue(n);
1313:                }
1314:                int adr = 0;
1315:                if (m_useRecursionCache) {
1316:                    adr = m_multX * n + m_multY * endIndexS + m_multZ
1317:                            * endIndexT + m_multZZ * remainingMatchLength;
1318:                    if (cachekh2K[adr % maxCache] == adr + 1) {
1319:                        return cachekh2[adr % maxCache];
1320:                    }
1321:                }
1322:
1323:                int rml = 0; //counts the remaining match length
1324:                double result = 0;
1325:                //see code docs in kernelHelper()
1326:                //difference to implementation in kernelHelper:
1327:                //*)choose different starting point, which is found counting
1328:                //the maximal remaining match length from endIndexS.
1329:                //*)keep track of the remaining match length, rml, which is
1330:                //  incremented each loop
1331:                for (int iS = (endIndexS - remainingMatchLength); iS <= endIndexS; iS++) {
1332:                    result *= m_lambda;
1333:                    result += kernelHelper2LP(n, s, iS, t, endIndexT, rml++);
1334:                }
1335:
1336:                if (m_useRecursionCache && endIndexS >= 0 && endIndexT >= 0
1337:                        && n >= 0) {
1338:                    cachekhK[adr % maxCache] = adr + 1;
1339:                    cachekh[adr % maxCache] = result;
1340:                }
1341:                return result;
1342:            }
1343:
1344:            /**
1345:             * helper function for the evaluation of the kernel (K''n) using lambda
1346:             * pruning
1347:             *
1348:             * @param n the current length of the matching subsequence
1349:             * @param s first string, as a char array
1350:             * @param t second string, as a char array
1351:             * @param endIndexS the portion of s currently regarded is s[1:endIndexS]
1352:             * @param endIndexT the portion of t currently regarded is t[1:endIndexT]
1353:             * @param remainingMatchLength the number of characters that may still be
1354:             * used 
1355:             * for matching (i.e. gaps + matches in both strings)
1356:             * @return a partial result for K' 
1357:             */
1358:            protected double kernelHelper2LP(int n, char[] s, int endIndexS,
1359:                    char[] t, int endIndexT, int remainingMatchLength) {
1360:
1361:                //lambda pruning check
1362:                //stops recursion if the match is so long that the resulting
1363:                //power of lambda is smaller than minLambda
1364:                //if lambda pruning is not used, the remainingMatchLength is < 0
1365:                //and this check never stops the recursion
1366:                //if (remainingMatchLength <= 0) return 0;
1367:                if (remainingMatchLength < 2 * n)
1368:                    return getReturnValue(n);
1369:
1370:                //see code docs in kernelHelper2()
1371:                if (endIndexS < 0 || endIndexT < 0)
1372:                    return getReturnValue(n);
1373:                int adr = 0;
1374:                if (m_useRecursionCache) {
1375:                    adr = m_multX * n + m_multY * endIndexS + m_multZ
1376:                            * endIndexT + m_multZZ * remainingMatchLength;
1377:                    if (cachekh2K[adr % maxCache] == adr + 1) {
1378:                        return cachekh2[adr % maxCache];
1379:                    }
1380:                }
1381:
1382:                char x = s[endIndexS];
1383:                if (x == t[endIndexT]) {
1384:                    double ret = m_lambda
1385:                            * (kernelHelper2LP(n, s, endIndexS, t,
1386:                                    endIndexT - 1, remainingMatchLength - 1) + m_lambda
1387:                                    * kernelHelperLP(n - 1, s, endIndexS - 1,
1388:                                            t, endIndexT - 1,
1389:                                            remainingMatchLength - 2));
1390:                    if (m_useRecursionCache && endIndexS >= 0 && endIndexT >= 0
1391:                            && n >= 0) {
1392:                        cachekh2K[adr % maxCache] = adr + 1;
1393:                        cachekh2[adr % maxCache] = ret;
1394:                    }
1395:                    return ret;
1396:                }
1397:
1398:                //see code docs in kernelHelper()
1399:                //differences to implementation in kernelHelper():
1400:                //*) choose a different ending point for the loop
1401:                //   based on the remaining match length
1402:                int i;
1403:                int minIndex = endIndexT - remainingMatchLength;
1404:                if (minIndex < 0)
1405:                    minIndex = 0;
1406:                for (i = endIndexT; i >= minIndex; i--) {
1407:                    if (x == t[i]) {
1408:                        int skipLength = endIndexT - i;
1409:                        double ret = getPowerOfLambda(skipLength)
1410:                                * kernelHelper2LP(n, s, endIndexS, t, i,
1411:                                        remainingMatchLength - skipLength);
1412:                        if (m_useRecursionCache && endIndexS >= 0
1413:                                && endIndexT >= 0 && n >= 0) {
1414:                            cachekh2K[adr % maxCache] = adr + 1;
1415:                            cachekh2[adr % maxCache] = ret;
1416:                        }
1417:                        return ret;
1418:                    }
1419:                }
1420:                double ret = getReturnValue(n);
1421:                if (m_useRecursionCache && endIndexS >= 0 && endIndexT >= 0
1422:                        && n >= 0) {
1423:                    cachekh2K[adr % maxCache] = adr + 1;
1424:                    cachekh2[adr % maxCache] = ret;
1425:                }
1426:                return ret;
1427:            }
1428:
1429:            /**
1430:             * precalculates small powers of lambda to speed up the kernel evaluation
1431:             *
1432:             * @return		the powers
1433:             */
1434:            private double[] calculatePowersOfLambda() {
1435:                double[] powers = new double[MAX_POWER_OF_LAMBDA + 1];
1436:                powers[0] = 1.0;
1437:                double val = 1.0;
1438:                for (int i = 1; i <= MAX_POWER_OF_LAMBDA; i++) {
1439:                    val *= m_lambda;
1440:                    powers[i] = val;
1441:                }
1442:                return powers;
1443:            }
1444:
1445:            /**
1446:             * retrieves a power of lambda from the lambda cache or calculates it
1447:             * directly
1448:             *
1449:             * @param exponent	the exponent to calculate
1450:             * @return 		the exponent-th power of lambda
1451:             */
1452:            private double getPowerOfLambda(int exponent) {
1453:                if (exponent > MAX_POWER_OF_LAMBDA)
1454:                    return Math.pow(m_lambda, exponent);
1455:
1456:                if (exponent < 0)
1457:                    throw new IllegalArgumentException(
1458:                            "only positive powers of lambda may be computed");
1459:
1460:                return m_powersOflambda[exponent];
1461:            }
1462:
1463:            /**
1464:             * initializes variables etc.
1465:             * 
1466:             * @param data	the data to use
1467:             */
1468:            protected void initVars(Instances data) {
1469:                super .initVars(data);
1470:
1471:                m_kernelEvals = 0;
1472:                // take the first string attribute
1473:                m_strAttr = -1;
1474:                for (int i = 0; i < data.numAttributes(); i++) {
1475:                    if (i == data.classIndex())
1476:                        continue;
1477:                    if (data.attribute(i).type() == Attribute.STRING) {
1478:                        m_strAttr = i;
1479:                        break;
1480:                    }
1481:                }
1482:                m_numInsts = m_data.numInstances();
1483:                m_storage = new double[m_cacheSize];
1484:                m_keys = new long[m_cacheSize];
1485:                m_powersOflambda = calculatePowersOfLambda();
1486:            }
1487:
1488:            /** 
1489:             * Returns the Capabilities of this kernel.
1490:             *
1491:             * @return            the capabilities of this object
1492:             * @see               Capabilities
1493:             */
1494:            public Capabilities getCapabilities() {
1495:                Capabilities result = super .getCapabilities();
1496:
1497:                result.enable(Capability.STRING_ATTRIBUTES);
1498:                result.enableAllClasses();
1499:
1500:                return result;
1501:            }
1502:
1503:            /**
1504:             * builds the kernel with the given data.
1505:             * 
1506:             * @param data	the data to base the kernel on
1507:             * @throws Exception	if something goes wrong, e.g., the data does not
1508:             * 			consist of one string attribute and the class
1509:             */
1510:            public void buildKernel(Instances data) throws Exception {
1511:                super.buildKernel(data);
1512:            }
1513:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.