001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.solr.handler.admin;
017:
018: import java.io.IOException;
019: import java.net.MalformedURLException;
020: import java.net.URL;
021: import java.util.Collection;
022: import java.util.Date;
023: import java.util.HashMap;
024: import java.util.HashSet;
025: import java.util.LinkedList;
026: import java.util.List;
027: import java.util.Map;
028: import java.util.Set;
029: import java.util.logging.Level;
030: import java.util.logging.Logger;
031:
032: import org.apache.lucene.document.Document;
033: import org.apache.lucene.document.Fieldable;
034: import org.apache.lucene.index.IndexReader;
035: import org.apache.lucene.index.Term;
036: import org.apache.lucene.index.TermEnum;
037: import org.apache.lucene.index.TermFreqVector;
038: import org.apache.lucene.search.MatchAllDocsQuery;
039: import org.apache.lucene.search.Query;
040: import org.apache.lucene.search.Sort;
041: import org.apache.lucene.store.Directory;
042: import org.apache.lucene.util.PriorityQueue;
043: import org.apache.solr.core.SolrException;
044: import org.apache.solr.handler.RequestHandlerBase;
045: import org.apache.solr.handler.RequestHandlerUtils;
046: import org.apache.solr.request.SolrParams;
047: import org.apache.solr.request.SolrQueryRequest;
048: import org.apache.solr.request.SolrQueryResponse;
049: import org.apache.solr.schema.FieldType;
050: import org.apache.solr.schema.IndexSchema;
051: import org.apache.solr.schema.SchemaField;
052: import org.apache.solr.search.DocList;
053: import org.apache.solr.search.SolrIndexSearcher;
054: import org.apache.solr.search.SolrQueryParser;
055: import org.apache.solr.util.NamedList;
056: import org.apache.solr.util.SimpleOrderedMap;
057:
058: /**
059: * This handler exposes the internal lucene index. It is inspired by and
060: * modeled on Luke, the Lucene Index Browser by Andrzej Bialecki.
061: * http://www.getopt.org/luke/
062: * <p>
063: * NOTE: the response format is still likely to change. It should be designed so
064: * that it works nicely with an XSLT transformation. Until we have a nice
065: * XSLT front end for /admin, the format is still open to change.
066: * </p>
067: *
068: * For more documentation see:
069: * http://wiki.apache.org/solr/LukeRequestHandler
070: *
071: * @author ryan
072: * @version $Id: LukeRequestHandler.java 542679 2007-05-29 22:28:21Z ryan $
073: * @since solr 1.2
074: */
075: public class LukeRequestHandler extends RequestHandlerBase {
076: private static Logger log = Logger
077: .getLogger(LukeRequestHandler.class.getName());
078:
079: public static final String NUMTERMS = "numTerms";
080: public static final String DOC_ID = "docId";
081: public static final String ID = "id";
082: public static final int DEFAULT_COUNT = 10;
083:
084: @Override
085: public void handleRequestBody(SolrQueryRequest req,
086: SolrQueryResponse rsp) throws Exception {
087: RequestHandlerUtils.addExperimentalFormatWarning(rsp);
088:
089: IndexSchema schema = req.getSchema();
090: SolrIndexSearcher searcher = req.getSearcher();
091: IndexReader reader = searcher.getReader();
092: SolrParams params = req.getParams();
093: int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);
094:
095: // Always show the core lucene info
096: rsp.add("index", getIndexInfo(reader, numTerms > 0));
097:
098: Integer docId = params.getInt(DOC_ID);
099: if (docId == null && params.get(ID) != null) {
100: // Look for something with a given solr ID
101: SchemaField uniqueKey = schema.getUniqueKeyField();
102: String v = uniqueKey.getType().toInternal(params.get(ID));
103: Term t = new Term(uniqueKey.getName(), v);
104: docId = searcher.getFirstMatch(t);
105: if (docId < 0) {
106: throw new SolrException(
107: SolrException.ErrorCode.NOT_FOUND,
108: "Can't find document: " + params.get(ID));
109: }
110: }
111:
112: // Read the document from the index
113: if (docId != null) {
114: Document doc = null;
115: try {
116: doc = reader.document(docId);
117: } catch (Exception ex) {
118: }
119: if (doc == null) {
120: throw new SolrException(
121: SolrException.ErrorCode.NOT_FOUND,
122: "Can't find document: " + docId);
123: }
124:
125: SimpleOrderedMap<Object> info = getDocumentFieldsInfo(doc,
126: docId, reader, schema);
127:
128: SimpleOrderedMap<Object> docinfo = new SimpleOrderedMap<Object>();
129: docinfo.add("docId", docId);
130: docinfo.add("lucene", info);
131: docinfo.add("solr", doc);
132: rsp.add("doc", docinfo);
133: } else {
134: // If no doc is given, show all fields and top terms
135: Set<String> fields = null;
136: if (params.get(SolrParams.FL) != null) {
137: fields = new HashSet<String>();
138: for (String f : params.getParams(SolrParams.FL)) {
139: fields.add(f);
140: }
141: }
142: rsp.add("fields", getIndexedFieldsInfo(searcher, fields,
143: numTerms));
144: }
145:
146: // Add some generally helpful information
147: NamedList<Object> info = new SimpleOrderedMap<Object>();
148: info.add("key", getFieldFlagsKey());
149: info
150: .add(
151: "NOTE",
152: "Document Frequency (df) is not updated when a document is marked for deletion. df values include deleted documents.");
153: rsp.add("info", info);
154: }
155:
156: /**
157: * @return a string representing a Fieldable's flags.
158: */
159: private static String getFieldFlags(Fieldable f) {
160: StringBuilder flags = new StringBuilder();
161: flags.append((f != null && f.isIndexed()) ? 'I' : '-');
162: flags.append((f != null && f.isTokenized()) ? 'T' : '-');
163: flags.append((f != null && f.isStored()) ? 'S' : '-');
164: flags.append((false) ? 'M' : '-'); // SchemaField Specific
165: flags.append((f != null && f.isTermVectorStored()) ? 'V' : '-');
166: flags
167: .append((f != null && f.isStoreOffsetWithTermVector()) ? 'o'
168: : '-');
169: flags
170: .append((f != null && f.isStorePositionWithTermVector()) ? 'p'
171: : '-');
172: flags.append((f != null && f.getOmitNorms()) ? 'O' : '-');
173: flags.append((f != null && f.isLazy()) ? 'L' : '-');
174: flags.append((f != null && f.isBinary()) ? 'B' : '-');
175: flags.append((f != null && f.isCompressed()) ? 'C' : '-');
176: flags.append((false) ? 'f' : '-'); // SchemaField Specific
177: flags.append((false) ? 'l' : '-'); // SchemaField Specific
178: return flags.toString();
179: }
180:
181: /**
182: * @return a string representing a SchemaField's flags.
183: */
184: private static String getFieldFlags(SchemaField f) {
185: FieldType t = (f == null) ? null : f.getType();
186:
187: // see: http://www.nabble.com/schema-field-properties-tf3437753.html#a9585549
188: boolean lazy = false; // "lazy" is purely a property of reading fields
189: boolean binary = false; // Currently not possible
190:
191: StringBuilder flags = new StringBuilder();
192: flags.append((f != null && f.indexed()) ? 'I' : '-');
193: flags.append((t != null && t.isTokenized()) ? 'T' : '-');
194: flags.append((f != null && f.stored()) ? 'S' : '-');
195: flags.append((f != null && f.multiValued()) ? 'M' : '-');
196: flags.append((f != null && f.storeTermVector()) ? 'V' : '-');
197: flags.append((f != null && f.storeTermOffsets()) ? 'o' : '-');
198: flags.append((f != null && f.storeTermPositions()) ? 'p' : '-');
199: flags.append((f != null && f.omitNorms()) ? 'O' : '-');
200: flags.append((lazy) ? 'L' : '-');
201: flags.append((binary) ? 'B' : '-');
202: flags.append((f != null && f.isCompressed()) ? 'C' : '-');
203: flags.append((f != null && f.sortMissingFirst()) ? 'f' : '-');
204: flags.append((f != null && f.sortMissingLast()) ? 'l' : '-');
205: return flags.toString();
206: }
207:
208: /**
209: * @return a key to what each character means
210: */
211: private static SimpleOrderedMap<String> getFieldFlagsKey() {
212: SimpleOrderedMap<String> key = new SimpleOrderedMap<String>();
213: key.add("I", "Indexed");
214: key.add("T", "Tokenized");
215: key.add("S", "Stored");
216: key.add("M", "Multivalued");
217: key.add("V", "TermVector Stored");
218: key.add("o", "Store Offset With TermVector");
219: key.add("p", "Store Position With TermVector");
220: key.add("O", "Omit Norms");
221: key.add("L", "Lazy");
222: key.add("B", "Binary");
223: key.add("C", "Compressed");
224: key.add("f", "Sort Missing First");
225: key.add("l", "Sort Missing Last");
226: return key;
227: }
228:
229: private static SimpleOrderedMap<Object> getDocumentFieldsInfo(
230: Document doc, int docId, IndexReader reader,
231: IndexSchema schema) throws IOException {
232: SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
233: for (Object o : doc.getFields()) {
234: Fieldable fieldable = (Fieldable) o;
235: SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();
236:
237: SchemaField sfield = schema
238: .getFieldOrNull(fieldable.name());
239: FieldType ftype = (sfield == null) ? null : sfield
240: .getType();
241:
242: f.add("type", (ftype == null) ? null : ftype.getTypeName());
243: f.add("schema", getFieldFlags(sfield));
244: f.add("flags", getFieldFlags(fieldable));
245:
246: Term t = new Term(fieldable.name(), fieldable.stringValue());
247: f.add("value", (ftype == null) ? null : ftype
248: .toExternal(fieldable));
249: f.add("internal", fieldable.stringValue()); // may be a binary number
250: f.add("boost", fieldable.getBoost());
251: f.add("docFreq", reader.docFreq(t)); // this can be 0 for non-indexed fields
252:
253: // If we have a term vector, return that
254: if (fieldable.isTermVectorStored()) {
255: try {
256: TermFreqVector v = reader.getTermFreqVector(docId,
257: fieldable.name());
258: if (v != null) {
259: SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
260: for (int i = 0; i < v.size(); i++) {
261: tfv.add(v.getTerms()[i], v
262: .getTermFrequencies()[i]);
263: }
264: f.add("termVector", tfv);
265: }
266: } catch (Exception ex) {
267: log.log(Level.WARNING, "error writing term vector",
268: ex);
269: }
270: }
271:
272: finfo.add(fieldable.name(), f);
273: }
274: return finfo;
275: }
276:
277: @SuppressWarnings("unchecked")
278: private static SimpleOrderedMap<Object> getIndexedFieldsInfo(
279: final SolrIndexSearcher searcher, final Set<String> fields,
280: final int numTerms) throws Exception {
281: Query matchAllDocs = new MatchAllDocsQuery();
282: SolrQueryParser qp = searcher.getSchema().getSolrQueryParser(
283: null);
284:
285: IndexReader reader = searcher.getReader();
286: IndexSchema schema = searcher.getSchema();
287:
288: // Walk the term enum and keep a priority queue for each map in our set
289: Map<String, TopTermQueue> ttinfo = null;
290: if (numTerms > 0) {
291: ttinfo = getTopTerms(reader, fields, numTerms, null);
292: }
293: SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
294: Collection<String> fieldNames = reader
295: .getFieldNames(IndexReader.FieldOption.ALL);
296: for (String fieldName : fieldNames) {
297: if (fields != null && !fields.contains(fieldName)) {
298: continue; // if a field is specified, only them
299: }
300:
301: SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();
302:
303: SchemaField sfield = schema.getFieldOrNull(fieldName);
304: FieldType ftype = (sfield == null) ? null : sfield
305: .getType();
306:
307: f.add("type", (ftype == null) ? null : ftype.getTypeName());
308: f.add("schema", getFieldFlags(sfield));
309:
310: // If numTerms==0, the call is just asking for a quick field list
311: if (ttinfo != null && sfield != null && sfield.indexed()) {
312: Query q = qp.parse(fieldName + ":[* TO *]");
313: int docCount = searcher.numDocs(q, matchAllDocs);
314: if (docCount > 0) {
315: // Find a document with this field
316: DocList ds = searcher.getDocList(q, (Query) null,
317: (Sort) null, 0, 1);
318: try {
319: Document doc = searcher.doc(ds.iterator()
320: .next());
321: Fieldable fld = doc.getFieldable(fieldName);
322: if (fld != null) {
323: f.add("index", getFieldFlags(fld));
324: } else {
325: // it is a non-stored field...
326: f.add("index", "(unstored field)");
327: }
328: } catch (Exception ex) {
329: log
330: .warning("error reading field: "
331: + fieldName);
332: }
333: // Find one document so we can get the fieldable
334: }
335: f.add("docs", docCount);
336:
337: TopTermQueue topTerms = ttinfo.get(fieldName);
338: if (topTerms != null) {
339: f.add("distinct", topTerms.distinctTerms);
340:
341: // Include top terms
342: f.add("topTerms", topTerms.toNamedList(searcher
343: .getSchema()));
344:
345: // Add a histogram
346: f
347: .add("histogram", topTerms.histogram
348: .toNamedList());
349: }
350: }
351:
352: // Add the field
353: finfo.add(fieldName, f);
354: }
355: return finfo;
356: }
357:
358: private static SimpleOrderedMap<Object> getIndexInfo(
359: IndexReader reader, boolean countTerms) throws IOException {
360: Directory dir = reader.directory();
361: SimpleOrderedMap<Object> indexInfo = new SimpleOrderedMap<Object>();
362: indexInfo.add("numDocs", reader.numDocs());
363: indexInfo.add("maxDoc", reader.maxDoc());
364:
365: if (countTerms) {
366: TermEnum te = reader.terms();
367: int numTerms = 0;
368: while (te.next()) {
369: numTerms++;
370: }
371: indexInfo.add("numTerms", numTerms);
372: }
373:
374: indexInfo.add("version", reader.getVersion()); // TODO? Is this different then: IndexReader.getCurrentVersion( dir )?
375: indexInfo.add("optimized", reader.isOptimized());
376: indexInfo.add("current", reader.isCurrent());
377: indexInfo.add("hasDeletions", reader.hasDeletions());
378: indexInfo.add("directory", dir);
379: indexInfo.add("lastModified", new Date(IndexReader
380: .lastModified(dir)));
381: return indexInfo;
382: }
383:
384: //////////////////////// SolrInfoMBeans methods //////////////////////
385:
386: @Override
387: public String getDescription() {
388: return "Lucene Index Browser. Inspired and modeled after Luke: http://www.getopt.org/luke/";
389: }
390:
391: @Override
392: public String getVersion() {
393: return "$Revision: 542679 $";
394: }
395:
396: @Override
397: public String getSourceId() {
398: return "$Id: LukeRequestHandler.java 542679 2007-05-29 22:28:21Z ryan $";
399: }
400:
401: @Override
402: public String getSource() {
403: return "$URL: https://svn.apache.org/repos/asf/lucene/solr/branches/branch-1.2/src/java/org/apache/solr/handler/admin/LukeRequestHandler.java $";
404: }
405:
406: @Override
407: public URL[] getDocs() {
408: try {
409: return new URL[] { new URL(
410: "http://wiki.apache.org/solr/LukeRequestHandler") };
411: } catch (MalformedURLException ex) {
412: return null;
413: }
414: }
415:
416: ///////////////////////////////////////////////////////////////////////////////////////
417:
418: private static class TermHistogram {
419: int maxBucket = -1;
420: public Map<Integer, Integer> hist = new HashMap<Integer, Integer>();
421:
422: private static final double LOG2 = Math.log(2);
423:
424: public static int getPowerOfTwoBucket(int num) {
425: int exp = (int) Math.ceil((Math.log(num) / LOG2));
426: return (int) Math.pow(2, exp);
427: }
428:
429: public void add(int df) {
430: Integer bucket = getPowerOfTwoBucket(df);
431: if (bucket > maxBucket) {
432: maxBucket = bucket;
433: }
434: Integer old = hist.get(bucket);
435: if (old == null) {
436: hist.put(bucket, 1);
437: } else {
438: hist.put(bucket, old + 1);
439: }
440: }
441:
442: // TODO? should this be a list or a map?
443: public NamedList<Integer> toNamedList() {
444: NamedList<Integer> nl = new NamedList<Integer>();
445: for (int bucket = 2; bucket <= maxBucket; bucket *= 2) {
446: Integer val = hist.get(bucket);
447: if (val == null) {
448: val = 0;
449: }
450: nl.add("" + bucket, val);
451: }
452: return nl;
453: }
454: }
455:
456: /**
457: * Private internal class that counts up frequent terms
458: */
459: private static class TopTermQueue extends PriorityQueue {
460: static class TermInfo {
461: TermInfo(Term t, int df) {
462: term = t;
463: docFreq = df;
464: }
465:
466: int docFreq;
467: Term term;
468: }
469:
470: public int minFreq = 0;
471: public int distinctTerms = 0;
472: public TermHistogram histogram;
473:
474: TopTermQueue(int size) {
475: initialize(size);
476: histogram = new TermHistogram();
477: }
478:
479: @Override
480: protected final boolean lessThan(Object a, Object b) {
481: TermInfo termInfoA = (TermInfo) a;
482: TermInfo termInfoB = (TermInfo) b;
483: return termInfoA.docFreq < termInfoB.docFreq;
484: }
485:
486: /**
487: * This is a destructive call... the queue is empty at the end
488: */
489: public NamedList<Integer> toNamedList(IndexSchema schema) {
490: // reverse the list..
491: List<TermInfo> aslist = new LinkedList<TermInfo>();
492: while (size() > 0) {
493: aslist.add(0, (TermInfo) pop());
494: }
495:
496: NamedList<Integer> list = new NamedList<Integer>();
497: for (TermInfo i : aslist) {
498: String txt = i.term.text();
499: SchemaField ft = schema.getFieldOrNull(i.term.field());
500: if (ft != null) {
501: txt = ft.getType().indexedToReadable(txt);
502: }
503: list.add(txt, i.docFreq);
504: }
505: return list;
506: }
507: }
508:
509: private static Map<String, TopTermQueue> getTopTerms(
510: IndexReader reader, Set<String> fields, int numTerms,
511: Set<String> junkWords) throws Exception {
512: Map<String, TopTermQueue> info = new HashMap<String, TopTermQueue>();
513: TermEnum terms = reader.terms();
514:
515: while (terms.next()) {
516: String field = terms.term().field();
517: String t = terms.term().text();
518:
519: // Compute distinct terms for every field
520: TopTermQueue tiq = info.get(field);
521: if (tiq == null) {
522: tiq = new TopTermQueue(numTerms);
523: info.put(field, tiq);
524: }
525: tiq.distinctTerms++;
526: tiq.histogram.add(terms.docFreq()); // add the term to the histogram
527:
528: // Only save the distinct terms for fields we worry about
529: if (fields != null && fields.size() > 0) {
530: if (!fields.contains(field)) {
531: continue;
532: }
533: }
534: if (junkWords != null && junkWords.contains(t)) {
535: continue;
536: }
537:
538: if (terms.docFreq() > tiq.minFreq) {
539: tiq.put(new TopTermQueue.TermInfo(terms.term(), terms
540: .docFreq()));
541: if (tiq.size() >= numTerms) { // if tiq full
542: tiq.pop(); // remove lowest in tiq
543: tiq.minFreq = ((TopTermQueue.TermInfo) tiq.top()).docFreq; // reset minFreq
544: }
545: }
546: }
547: return info;
548: }
549: }
|