001: /**
002: * Licensed to the Apache Software Foundation (ASF) under one or more
003: * contributor license agreements. See the NOTICE file distributed with
004: * this work for additional information regarding copyright ownership.
005: * The ASF licenses this file to You under the Apache License, Version 2.0
006: * (the "License"); you may not use this file except in compliance with
007: * the License. You may obtain a copy of the License at
008: *
009: * http://www.apache.org/licenses/LICENSE-2.0
010: *
011: * Unless required by applicable law or agreed to in writing, software
012: * distributed under the License is distributed on an "AS IS" BASIS,
013: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014: * See the License for the specific language governing permissions and
015: * limitations under the License.
016: */package org.apache.lucene.benchmark.quality.trec;
017:
018: import java.io.BufferedReader;
019: import java.io.IOException;
020: import java.util.ArrayList;
021: import java.util.Arrays;
022: import java.util.HashMap;
023:
024: import org.apache.lucene.benchmark.quality.QualityQuery;
025:
/**
 * Read TREC topics.
 * <p>
 * Expects this topic format -
 * <pre>
 *   &lt;top&gt;
 *   &lt;num&gt; Number: nnn
 *
 *   &lt;title&gt; title of the topic
 *
 *   &lt;desc&gt; Description:
 *   description of the topic
 *
 *   &lt;narr&gt; Narrative:
 *   "story" composed by assessors.
 *
 *   &lt;/top&gt;
 * </pre>
 * Comment lines starting with '#' are ignored.
 */
046: public class TrecTopicsReader {
047:
048: private static final String newline = System
049: .getProperty("line.separator");
050:
051: /**
052: * Constructor for Trec's TopicsReader
053: */
054: public TrecTopicsReader() {
055: super ();
056: }
057:
058: /**
059: * Read quality queries from trec format topics file.
060: * @param reader where queries are read from.
061: * @return the result quality queries.
062: * @throws IOException if cannot read the queries.
063: */
064: public QualityQuery[] readQueries(BufferedReader reader)
065: throws IOException {
066: ArrayList res = new ArrayList();
067: StringBuffer sb;
068: try {
069: while (null != (sb = read(reader, "<top>", null, false,
070: false))) {
071: HashMap fields = new HashMap();
072: // id
073: sb = read(reader, "<num>", null, true, false);
074: int k = sb.indexOf(":");
075: String id = sb.substring(k + 1).trim();
076: // title
077: sb = read(reader, "<title>", null, true, false);
078: k = sb.indexOf(">");
079: String title = sb.substring(k + 1).trim();
080: // description
081: sb = read(reader, "<desc>", null, false, false);
082: sb = read(reader, "<narr>", null, false, true);
083: String descripion = sb.toString().trim();
084: // we got a topic!
085: fields.put("title", title);
086: fields.put("description", descripion);
087: QualityQuery topic = new QualityQuery(id, fields);
088: res.add(topic);
089: // skip narrative, get to end of doc
090: read(reader, "</top>", null, false, false);
091: }
092: } finally {
093: reader.close();
094: }
095: // sort result array (by ID)
096: QualityQuery qq[] = (QualityQuery[]) res
097: .toArray(new QualityQuery[0]);
098: Arrays.sort(qq);
099: return qq;
100: }
101:
102: // read until finding a line that starts with the specified prefix
103: private StringBuffer read(BufferedReader reader, String prefix,
104: StringBuffer sb, boolean collectMatchLine,
105: boolean collectAll) throws IOException {
106: sb = (sb == null ? new StringBuffer() : sb);
107: String sep = "";
108: while (true) {
109: String line = reader.readLine();
110: if (line == null) {
111: return null;
112: }
113: if (line.startsWith(prefix)) {
114: if (collectMatchLine) {
115: sb.append(sep + line);
116: sep = newline;
117: }
118: break;
119: }
120: if (collectAll) {
121: sb.append(sep + line);
122: sep = newline;
123: }
124: }
125: //System.out.println("read: "+sb);
126: return sb;
127: }
128: }
|