001: package org.apache.lucene.search;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.Analyzer;
021: import org.apache.lucene.analysis.Token;
022: import org.apache.lucene.analysis.TokenStream;
023: import org.apache.lucene.index.TermFreqVector;
024:
025: import java.io.IOException;
026: import java.io.StringReader;
027: import java.util.*;
028:
029: /**
030: *
031: *
032: **/
033: public class QueryTermVector implements TermFreqVector {
034: private String[] terms = new String[0];
035: private int[] termFreqs = new int[0];
036:
037: public String getField() {
038: return null;
039: }
040:
041: /**
042: *
043: * @param queryTerms The original list of terms from the query, can contain duplicates
044: */
045: public QueryTermVector(String[] queryTerms) {
046:
047: processTerms(queryTerms);
048: }
049:
050: public QueryTermVector(String queryString, Analyzer analyzer) {
051: if (analyzer != null) {
052: TokenStream stream = analyzer.tokenStream("",
053: new StringReader(queryString));
054: if (stream != null) {
055: Token next = null;
056: List terms = new ArrayList();
057: try {
058: while ((next = stream.next()) != null) {
059: terms.add(next.termText());
060: }
061: processTerms((String[]) terms
062: .toArray(new String[terms.size()]));
063: } catch (IOException e) {
064: }
065: }
066: }
067: }
068:
069: private void processTerms(String[] queryTerms) {
070: if (queryTerms != null) {
071: Arrays.sort(queryTerms);
072: Map tmpSet = new HashMap(queryTerms.length);
073: //filter out duplicates
074: List tmpList = new ArrayList(queryTerms.length);
075: List tmpFreqs = new ArrayList(queryTerms.length);
076: int j = 0;
077: for (int i = 0; i < queryTerms.length; i++) {
078: String term = queryTerms[i];
079: Integer position = (Integer) tmpSet.get(term);
080: if (position == null) {
081: tmpSet.put(term, new Integer(j++));
082: tmpList.add(term);
083: tmpFreqs.add(new Integer(1));
084: } else {
085: Integer integer = (Integer) tmpFreqs.get(position
086: .intValue());
087: tmpFreqs.set(position.intValue(), new Integer(
088: integer.intValue() + 1));
089: }
090: }
091: terms = (String[]) tmpList.toArray(terms);
092: //termFreqs = (int[])tmpFreqs.toArray(termFreqs);
093: termFreqs = new int[tmpFreqs.size()];
094: int i = 0;
095: for (Iterator iter = tmpFreqs.iterator(); iter.hasNext();) {
096: Integer integer = (Integer) iter.next();
097: termFreqs[i++] = integer.intValue();
098: }
099: }
100: }
101:
102: public final String toString() {
103: StringBuffer sb = new StringBuffer();
104: sb.append('{');
105: for (int i = 0; i < terms.length; i++) {
106: if (i > 0)
107: sb.append(", ");
108: sb.append(terms[i]).append('/').append(termFreqs[i]);
109: }
110: sb.append('}');
111: return sb.toString();
112: }
113:
114: public int size() {
115: return terms.length;
116: }
117:
118: public String[] getTerms() {
119: return terms;
120: }
121:
122: public int[] getTermFrequencies() {
123: return termFreqs;
124: }
125:
126: public int indexOf(String term) {
127: int res = Arrays.binarySearch(terms, term);
128: return res >= 0 ? res : -1;
129: }
130:
131: public int[] indexesOf(String[] terms, int start, int len) {
132: int res[] = new int[len];
133:
134: for (int i = 0; i < len; i++) {
135: res[i] = indexOf(terms[i]);
136: }
137: return res;
138: }
139:
140: }
|