001: /*
002: * Created on 25-Jan-2006
003: */
004: package org.apache.lucene.xmlparser.builders;
005:
006: import java.io.IOException;
007: import java.io.StringReader;
008: import java.util.HashSet;
009: import java.util.Set;
010:
011: import org.apache.lucene.analysis.Analyzer;
012: import org.apache.lucene.analysis.Token;
013: import org.apache.lucene.analysis.TokenStream;
014: import org.apache.lucene.search.similar.MoreLikeThisQuery;
015: import org.apache.lucene.search.Query;
016: import org.apache.lucene.xmlparser.DOMUtils;
017: import org.apache.lucene.xmlparser.ParserException;
018: import org.apache.lucene.xmlparser.QueryBuilder;
019: import org.w3c.dom.Element;
020:
021: /**
022: * Licensed to the Apache Software Foundation (ASF) under one or more
023: * contributor license agreements. See the NOTICE file distributed with
024: * this work for additional information regarding copyright ownership.
025: * The ASF licenses this file to You under the Apache License, Version 2.0
026: * (the "License"); you may not use this file except in compliance with
027: * the License. You may obtain a copy of the License at
028: *
029: * http://www.apache.org/licenses/LICENSE-2.0
030: *
031: * Unless required by applicable law or agreed to in writing, software
032: * distributed under the License is distributed on an "AS IS" BASIS,
033: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
034: * See the License for the specific language governing permissions and
035: * limitations under the License.
036: */
037:
038: /**
039: * @author maharwood
040: */
041: public class LikeThisQueryBuilder implements QueryBuilder {
042:
043: private Analyzer analyzer;
044: String defaultFieldNames[];
045: int defaultMaxQueryTerms = 20;
046: int defaultMinTermFrequency = 1;
047: float defaultPercentTermsToMatch = 30; //default is a 3rd of selected terms must match
048:
049: public LikeThisQueryBuilder(Analyzer analyzer,
050: String[] defaultFieldNames) {
051: this .analyzer = analyzer;
052: this .defaultFieldNames = defaultFieldNames;
053: }
054:
055: /* (non-Javadoc)
056: * @see org.apache.lucene.xmlparser.QueryObjectBuilder#process(org.w3c.dom.Element)
057: */
058: public Query getQuery(Element e) throws ParserException {
059: String fieldsList = e.getAttribute("fieldNames"); //a comma-delimited list of fields
060: String fields[] = defaultFieldNames;
061: if ((fieldsList != null) && (fieldsList.trim().length() > 0)) {
062: fields = fieldsList.trim().split(",");
063: //trim the fieldnames
064: for (int i = 0; i < fields.length; i++) {
065: fields[i] = fields[i].trim();
066: }
067: }
068:
069: //Parse any "stopWords" attribute
070: //TODO MoreLikeThis needs to ideally have per-field stopWords lists - until then
071: //I use all analyzers/fields to generate multi-field compatible stop list
072: String stopWords = e.getAttribute("stopWords");
073: Set stopWordsSet = null;
074: if ((stopWords != null) && (fields != null)) {
075: stopWordsSet = new HashSet();
076: for (int i = 0; i < fields.length; i++) {
077: TokenStream ts = analyzer.tokenStream(fields[i],
078: new StringReader(stopWords));
079: try {
080: Token stopToken = ts.next();
081: while (stopToken != null) {
082: stopWordsSet.add(stopToken.termText());
083: stopToken = ts.next();
084: }
085: } catch (IOException ioe) {
086: throw new ParserException(
087: "IoException parsing stop words list in "
088: + getClass().getName() + ":"
089: + ioe.getLocalizedMessage());
090: }
091: }
092: }
093:
094: MoreLikeThisQuery mlt = new MoreLikeThisQuery(DOMUtils
095: .getText(e), fields, analyzer);
096: mlt.setMaxQueryTerms(DOMUtils.getAttribute(e, "maxQueryTerms",
097: defaultMaxQueryTerms));
098: mlt.setMinTermFrequency(DOMUtils.getAttribute(e,
099: "minTermFrequency", defaultMinTermFrequency));
100: mlt
101: .setPercentTermsToMatch(DOMUtils.getAttribute(e,
102: "percentTermsToMatch",
103: defaultPercentTermsToMatch) / 100);
104: mlt.setStopWords(stopWordsSet);
105: int minDocFreq = DOMUtils.getAttribute(e, "minDocFreq", -1);
106: if (minDocFreq >= 0) {
107: mlt.setMinDocFreq(minDocFreq);
108: }
109:
110: mlt.setBoost(DOMUtils.getAttribute(e, "boost", 1.0f));
111:
112: return mlt;
113: }
114:
115: }
|