/*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using Similarity = Lucene.Net.Search.Similarity;
using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;

namespace Lucene.Net.Index
{
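/// <summary>Writes a single document to a new index segment: field names
/// (.fnm), stored field values (via FieldsWriter), the inverted postings
/// (.frq and .prx plus the term dictionary), optional term vectors, and one
/// norm file per indexed field.</summary>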
public sealed class DocumentWriter
{
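// Applies the default term index interval; the IndexWriter-based ctor
// overrides it with the writer's configured value.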
private void InitBlock()
{
termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
}
private Analyzer analyzer;
private Directory directory;
private Similarity similarity;
private FieldInfos fieldInfos;
private int maxFieldLength;
private int termIndexInterval;
private System.IO.TextWriter infoStream;
/// <summary>This ctor is used by test code only.</summary>
/// <param name="directory">The directory to write the document information to</param>
/// <param name="analyzer">The analyzer to use for the document</param>
/// <param name="similarity">The Similarity function</param>
/// <param name="maxFieldLength">The maximum number of tokens a field may have</param>
internal DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
{
InitBlock();
this.directory = directory;
this.analyzer = analyzer;
this.similarity = similarity;
this.maxFieldLength = maxFieldLength;
}
internal DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer)
{
InitBlock();
this.directory = directory;
this.analyzer = analyzer;
this.similarity = writer.GetSimilarity();
this.maxFieldLength = writer.GetMaxFieldLength();
this.termIndexInterval = writer.GetTermIndexInterval();
}
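// Illustrative usage sketch (hypothetical names; in practice IndexWriter
// drives this class and picks the segment name):
//
// Document doc = new Document();
// doc.Add(new Field("body", "some text", Field.Store.YES, Field.Index.TOKENIZED));
// DocumentWriter writer = new DocumentWriter(directory, analyzer, indexWriter);
// writer.AddDocument("_0", doc); // produces _0.fnm, _0.frq, _0.prx, ...

/// <summary>Adds a document to the given segment: writes the field infos and
/// stored field values, inverts the document into the posting table, then
/// writes postings, term vectors, and norms.</summary>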
public /*internal*/ void AddDocument(System.String segment, Document doc)
{
// write field names
fieldInfos = new FieldInfos();
fieldInfos.Add(doc);
fieldInfos.Write(directory, segment + ".fnm");
// write field values
FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
try
{
fieldsWriter.AddDocument(doc);
}
finally
{
fieldsWriter.Close();
}
// invert doc into postingTable
postingTable.Clear(); // clear postingTable
fieldLengths = new int[fieldInfos.Size()]; // init fieldLengths
fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions
fieldOffsets = new int[fieldInfos.Size()]; // init fieldOffsets
fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts
float boost = doc.GetBoost();
for (int i = 0; i < fieldBoosts.Length; i++)
{
fieldBoosts[i] = boost;
}
InvertDocument(doc);
// sort postingTable into an array
Posting[] postings = SortPostingTable();
/*
// Debug dump of the sorted postings (translated from the original Java):
for (int i = 0; i < postings.Length; i++)
{
Posting posting = postings[i];
System.Console.Write(posting.term);
System.Console.Write(" freq=" + posting.freq);
System.Console.Write(" pos=");
System.Console.Write(posting.positions[0]);
for (int j = 1; j < posting.freq; j++)
System.Console.Write("," + posting.positions[j]);
System.Console.WriteLine("");
}
*/
// write postings
WritePostings(postings, segment);
// write norms of indexed fields
WriteNorms(segment);
}
// Keys are Terms, values are Postings.
// Used to buffer a document before it is written to the index.
private System.Collections.Hashtable postingTable = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());
private int[] fieldLengths; // number of tokens indexed per field
private int[] fieldPositions; // next token position per field
private int[] fieldOffsets; // current character offset per field
private float[] fieldBoosts; // cumulative boost per field (doc boost * field boosts)
/// <summary>Tokenizes the fields of a document into Postings, tracking each
/// field's token count, position (including the analyzer's position-increment
/// gap between multiple values of the same field), character offset, and
/// boost. Tokenizing stops once maxFieldLength tokens have been seen.</summary>
private void InvertDocument(Document doc)
{
System.Collections.IEnumerator fields = doc.Fields();
while (fields.MoveNext())
{
Field field = (Field) fields.Current;
System.String fieldName = field.Name();
int fieldNumber = fieldInfos.FieldNumber(fieldName);
int length = fieldLengths[fieldNumber]; // length of field
int position = fieldPositions[fieldNumber]; // position in field
if (length > 0)
position += analyzer.GetPositionIncrementGap(fieldName);
int offset = fieldOffsets[fieldNumber]; // character offset in field
if (field.IsIndexed())
{
if (!field.IsTokenized())
{
// un-tokenized field
System.String stringValue = field.StringValue();
if (field.IsStoreOffsetWithTermVector())
AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
else
AddPosition(fieldName, stringValue, position++, null);
offset += stringValue.Length;
length++;
}
else
{
System.IO.TextReader reader; // find or make Reader
if (field.ReaderValue() != null)
reader = field.ReaderValue();
else if (field.StringValue() != null)
reader = new System.IO.StringReader(field.StringValue());
else
throw new System.ArgumentException("field must have either String or Reader value");
// Tokenize field and add to postingTable
TokenStream stream = analyzer.TokenStream(fieldName, reader);
try
{
Token lastToken = null;
for (Token t = stream.Next(); t != null; t = stream.Next())
{
position += (t.GetPositionIncrement() - 1);
if (field.IsStoreOffsetWithTermVector())
AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
else
AddPosition(fieldName, t.TermText(), position++, null);
lastToken = t;
if (++length > maxFieldLength)
{
if (infoStream != null)
infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
break;
}
}
if (lastToken != null)
offset += lastToken.EndOffset() + 1;
}
finally
{
stream.Close();
}
}
fieldLengths[fieldNumber] = length; // save field length
fieldPositions[fieldNumber] = position; // save field position
fieldBoosts[fieldNumber] *= field.GetBoost();
fieldOffsets[fieldNumber] = offset;
}
}
}
private Term termBuffer = new Term("", ""); // reused across AddPosition calls to avoid allocating a Term per lookup
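/// <summary>Records one occurrence of a term: appends the position (and, when
/// given, the term-vector offset) to the term's existing Posting, doubling the
/// arrays when they fill, or creates a new Posting on first occurrence.</summary>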
private void AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
{
termBuffer.Set(field, text);
//System.out.println("Offset: " + offset);
Posting ti = (Posting) postingTable[termBuffer];
if (ti != null)
{
// word seen before
int freq = ti.freq;
if (ti.positions.Length == freq)
{
// positions array is full
int[] newPositions = new int[freq * 2]; // double size
int[] positions = ti.positions;
for (int i = 0; i < freq; i++)
// copy old positions to new
newPositions[i] = positions[i];
ti.positions = newPositions;
}
ti.positions[freq] = position; // add new position
if (offset != null)
{
if (ti.offsets.Length == freq)
{
TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
TermVectorOffsetInfo[] offsets = ti.offsets;
for (int i = 0; i < freq; i++)
{
newOffsets[i] = offsets[i];
}
ti.offsets = newOffsets;
}
ti.offsets[freq] = offset;
}
ti.freq = freq + 1; // update frequency
}
else
{
// word not seen before
Term term = new Term(field, text, false);
postingTable[term] = new Posting(term, position, offset);
}
}
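/// <summary>Copies the posting table into an array and sorts it by term, the
/// order in which the term dictionary must be written.</summary>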
private Posting[] SortPostingTable()
{
// copy postingTable into an array
Posting[] array = new Posting[postingTable.Count];
System.Collections.IEnumerator postings = postingTable.Values.GetEnumerator();
for (int i = 0; postings.MoveNext(); i++)
{
array[i] = (Posting) postings.Current;
}
// sort the array
QuickSort(array, 0, array.Length - 1);
return array;
}
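// Recursive quicksort of postings by term. The three conditional swaps below
// place the median of the first, middle, and last terms at the midpoint
// before partitioning around it.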
private static void QuickSort(Posting[] postings, int lo, int hi)
{
if (lo >= hi)
return;
int mid = (lo + hi) / 2;
if (postings[lo].term.CompareTo(postings[mid].term) > 0)
{
Posting tmp = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp;
}
if (postings[mid].term.CompareTo(postings[hi].term) > 0)
{
Posting tmp = postings[mid];
postings[mid] = postings[hi];
postings[hi] = tmp;
if (postings[lo].term.CompareTo(postings[mid].term) > 0)
{
Posting tmp2 = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp2;
}
}
int left = lo + 1;
int right = hi - 1;
if (left >= right)
return;
Term partition = postings[mid].term;
for (; ; )
{
while (postings[right].term.CompareTo(partition) > 0)
--right;
while (left < right && postings[left].term.CompareTo(partition) <= 0)
++left;
if (left < right)
{
Posting tmp = postings[left];
postings[left] = postings[right];
postings[right] = tmp;
--right;
}
else
{
break;
}
}
QuickSort(postings, lo, left);
QuickSort(postings, left + 1, hi);
}
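/// <summary>Writes the sorted postings to the term dictionary, .frq, and .prx
/// files, plus term vectors for fields that store them. Positions are
/// delta-encoded VInts: for example, occurrences at positions 3, 10, and 12
/// are written as the deltas 3, 7, and 2.</summary>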
private void WritePostings(Posting[] postings, System.String segment)
{
IndexOutput freq = null, prox = null;
TermInfosWriter tis = null;
TermVectorsWriter termVectorWriter = null;
try
{
//open files for inverse index storage
freq = directory.CreateOutput(segment + ".frq");
prox = directory.CreateOutput(segment + ".prx");
tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
TermInfo ti = new TermInfo();
System.String currentField = null;
for (int i = 0; i < postings.Length; i++)
{
Posting posting = postings[i];
// add an entry to the dictionary with pointers to prox and freq files
ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), -1);
tis.Add(posting.term, ti);
// add an entry to the freq file
int postingFreq = posting.freq;
if (postingFreq == 1)
freq.WriteVInt(1); // optimize freq=1: doc delta 0 with the low bit set
else
{
freq.WriteVInt(0); // the document number (always 0 in a one-doc segment)
freq.WriteVInt(postingFreq); // frequency in doc
}
int lastPosition = 0; // write positions
int[] positions = posting.positions;
for (int j = 0; j < postingFreq; j++)
{
// use delta-encoding
int position = positions[j];
prox.WriteVInt(position - lastPosition);
lastPosition = position;
}
// check to see if we switched to a new field
System.String termField = posting.term.Field();
if (currentField != termField)
{
// changing field - see if there is something to save
currentField = termField;
FieldInfo fi = fieldInfos.FieldInfo(currentField);
if (fi.storeTermVector)
{
if (termVectorWriter == null)
{
termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
termVectorWriter.OpenDocument();
}
termVectorWriter.OpenField(currentField);
}
else if (termVectorWriter != null)
{
termVectorWriter.CloseField();
}
}
if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
{
termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
}
}
if (termVectorWriter != null)
termVectorWriter.CloseDocument();
}
finally
{
// make an effort to close all streams we can but remember and re-throw
// the first exception encountered in this process
System.IO.IOException keep = null;
if (freq != null)
try
{
freq.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (prox != null)
try
{
prox.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (tis != null)
try
{
tis.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (termVectorWriter != null)
try
{
termVectorWriter.Close();
}
catch (System.IO.IOException e)
{
if (keep == null)
keep = e;
}
if (keep != null)
{
throw keep; // re-throw the first exception encountered while closing
}
}
}
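/// <summary>Writes one norm byte per indexed field (unless norms are omitted):
/// the field's accumulated boost times Similarity.LengthNorm of its token
/// count, compressed to a single byte by Similarity.EncodeNorm.</summary>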
private void WriteNorms(System.String segment)
{
for (int n = 0; n < fieldInfos.Size(); n++)
{
FieldInfo fi = fieldInfos.FieldInfo(n);
if (fi.isIndexed && !fi.omitNorms)
{
float norm = fieldBoosts[n] * similarity.LengthNorm(fi.name, fieldLengths[n]);
IndexOutput norms = directory.CreateOutput(segment + ".f" + n);
try
{
norms.WriteByte(Similarity.EncodeNorm(norm));
}
finally
{
norms.Close();
}
}
}
}
/// <summary>If non-null, a message is printed to this stream when maxFieldLength is reached.</summary>
internal void SetInfoStream(System.IO.TextWriter infoStream)
{
this.infoStream = infoStream;
}
}
/// <summary>Info about a Term in a doc: its frequency, the positions at which
/// it occurs, and optional term-vector offsets.</summary>
sealed class Posting
{
internal Term term; // the Term
internal int freq; // its frequency in doc
internal int[] positions; // positions it occurs at
internal TermVectorOffsetInfo[] offsets;
internal Posting(Term t, int position, TermVectorOffsetInfo offset)
{
term = t;
freq = 1;
positions = new int[1];
positions[0] = position;
if (offset != null)
{
offsets = new TermVectorOffsetInfo[1];
offsets[0] = offset;
}
else
offsets = null;
}
}
}