MtasFieldsConsumer.java
package mtas.codec;
import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import mtas.analysis.token.MtasOffset;
import mtas.analysis.token.MtasPosition;
import mtas.analysis.token.MtasToken;
import mtas.analysis.token.MtasTokenString;
import mtas.codec.payload.MtasPayloadDecoder;
import mtas.codec.tree.MtasRBTree;
import mtas.codec.tree.MtasTree;
import mtas.codec.tree.MtasTreeNode;
import mtas.codec.tree.MtasTreeNodeId;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.MappedMultiFields;
import org.apache.lucene.index.MergeState;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderSlice;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/**
* The Class MtasFieldsConsumer.
*/
/**
* The Class MtasFieldsConsumer creates several temporary and permanent files
* that together provide a forward index.
*
* <ul>
* <li><b>Temporary files</b><br>
* <ul>
* <li><b>Temporary file {@link #mtasTmpFieldFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_FIELD_EXTENSION} </b><br>
* Contains for each field a reference to the list of documents. Structure of
* content:
* <ul>
* <li><b>String</b>: field</li>
* <li><b>VLong</b>: reference to {@link #mtasDocFileName}</li>
* <li><b>VInt</b>: number of documents</li>
* <li><b>VLong</b>: reference to {@link #mtasTermFileName}</li>
* <li><b>VInt</b>: number of terms</li>
* <li><b>VLong</b>: reference to {@link #mtasPrefixFileName}</li>
* <li><b>VInt</b>: number of prefixes</li>
* </ul>
* </li>
* <li><b>Temporary file {@link #mtasTmpObjectFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_OBJECT_EXTENSION}</b><br>
* Contains for a specific field all objects constructed by
* {@link #createObjectAndRegisterPrefix}. For all fields, the objects are later
* copied to {@link #mtasObjectFileName} while statistics are collected. The
* structure of the content is identical to that of {@link #mtasObjectFileName}.</li>
* <li><b>Temporary file {@link #mtasTmpDocsFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_DOCS_EXTENSION}</b> <br>
* Contains for a specific field for each doc multiple fragments. Each occurring
* term results in a fragment. Structure of content:
* <ul>
* <li><b>VInt</b>: docId</li>
* <li><b>VInt</b>: number of objects in this fragment</li>
* <li><b>VLong</b>: offset references to {@link #mtasTmpObjectFileName}</li>
* <li><b>VInt</b>,<b>VLong</b>: mtasId object, reference temporary object in
* {@link #mtasTmpObjectFileName} minus offset</li>
* <li><b>VInt</b>,<b>VLong</b>: ...</li>
* </ul>
* </li>
* <li><b>Temporary file {@link #mtasTmpDocsChainedFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_DOCS_CHAINED_EXTENSION}
* </b><br>
* Contains for a specific field for each doc multiple chained fragments.
* Structure of content:
* <ul>
* <li><b>VInt</b>: docId</li>
* <li><b>VInt</b>: number of objects in this fragment</li>
* <li><b>VLong</b>: offset references to {@link #mtasTmpObjectFileName}</li>
* <li><b>VInt</b>,<b>VLong</b>: mtasId object, reference temporary object in
* {@link #mtasTmpObjectFileName} minus offset</li>
* <li><b>VInt</b>,<b>VLong</b>: ...</li>
* <li><b>VLong</b>: reference to next fragment in
* {@link #mtasTmpDocsChainedFileName}; a self reference indicates the end of
* the chain</li>
* </ul>
* </li>
* <li><b>Temporary file {@link #mtasTmpDocFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TMP_DOC_EXTENSION}</b><br>
* For each document
* <ul>
* <li><b>VInt</b>: docId</li>
* <li><b>VLong</b>: reference to {@link #mtasIndexObjectIdFileName}</li>
* <li><b>VLong</b>: reference first object, used as offset for tree index
* <li><b>VInt</b>: slope used in approximation reference objects index on id
* </li>
* <li><b>ZLong</b>: offset used in approximation reference objects index on id
* </li>
* <li><b>Byte</b>: flag indicating how corrections on the approximation
* references objects for the index on id are stored:
* {@link MtasCodecPostingsFormat#MTAS_STORAGE_BYTE},
* {@link MtasCodecPostingsFormat#MTAS_STORAGE_SHORT},
* {@link MtasCodecPostingsFormat#MTAS_STORAGE_INTEGER} or
* {@link MtasCodecPostingsFormat#MTAS_STORAGE_LONG}</li>
* <li><b>VInt</b>: number of objects in this document</li>
* <li><b>VInt</b>: first position</li>
* <li><b>VInt</b>: last position</li>
* </ul>
* </li>
* </ul>
* </li>
* <li><b>Final files</b><br>
* <ul>
* <li><b>File {@link #mtasIndexFieldFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_FIELD_EXTENSION}</b><br>
* Contains for each field a reference to the list of documents and the
* prefixes. Structure of content:
* <ul>
* <li><b>String</b>: field</li>
* <li><b>VLong</b>: reference to {@link #mtasDocFileName}</li>
* <li><b>VLong</b>: reference to {@link #mtasIndexDocIdFileName}</li>
* <li><b>VInt</b>: number of documents</li>
* <li><b>VLong</b>: reference to {@link #mtasTermFileName}</li>
* <li><b>VInt</b>: number of terms</li>
* <li><b>VLong</b>: reference to {@link #mtasPrefixFileName}</li>
* <li><b>VInt</b>: number of prefixes</li>
* </ul>
* </li>
* <li><b>File {@link #mtasTermFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_TERM_EXTENSION}</b><br>
* For each field, all unique terms are stored here. Structure of content:
* <ul>
* <li><b>String</b>: term</li>
* </ul>
* </li>
* <li><b>File {@link #mtasPrefixFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_PREFIX_EXTENSION}</b><br>
* For each field, all unique prefixes are stored here. Structure of content:
* <ul>
* <li><b>String</b>: prefix</li>
* </ul>
* </li>
* <li><b>File {@link #mtasObjectFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_OBJECT_EXTENSION}</b><br>
* Contains all objects for all fields. Structure of content:
* <ul>
* <li><b>VInt</b>: mtasId</li>
* <li><b>VInt</b>: objectFlags
* <ul>
* <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PARENT}</li>
* <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_RANGE}</li>
* <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_SET}</li>
* <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_OFFSET}</li>
* <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_REALOFFSET}</li>
* <li>{@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PAYLOAD}</li>
* </ul>
* </li>
* <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PARENT}<br>
* <b>VInt</b>: parentId
* <li>Only if
* {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_RANGE}<br>
* <b>VInt</b>,<b>VInt</b>: startPosition and (endPosition-startPosition)
* <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_SET}<br>
* <b>VInt</b>,<b>VInt</b>,<b>VInt</b>,...: number of positions, firstPosition,
* (position-previousPosition),...
* <li>Only if no {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_RANGE}
* or {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_POSITION_SET}<br>
* <b>VInt</b>: position
* <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_OFFSET}<br>
* <b>VInt</b>,<b>VInt</b>: startOffset, (endOffset-startOffset)
* <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_REALOFFSET}<br>
* <b>VInt</b>,<b>VInt</b>: startRealOffset, (endRealOffset-startRealOffset)
* <li>Only if {@link MtasCodecPostingsFormat#MTAS_OBJECT_HAS_PAYLOAD}<br>
* <b>VInt</b>,<b>Bytes</b>: number of bytes, payload
* <li><b>VLong</b>: reference to Term in {@link #mtasTermFileName}</li>
* </ul>
* </li>
* <li><b>File {@link #mtasIndexDocIdFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_DOC_ID_EXTENSION}
* </b><br>
* Contains for each field a tree structure {@link MtasTree} to search reference
* to {@link #mtasDocFileName} by id. Structure of content for each node:
* <ul>
* <li><b>VLong</b>: offset references to {@link #mtasIndexDocIdFileName}, only
* available in root node</li>
* <li><b>Byte</b>: flag, should be zero for this tree, only available in root
* node</li>
* <li><b>VInt</b>: left</li>
* <li><b>VInt</b>: right</li>
* <li><b>VInt</b>: max</li>
* <li><b>VLong</b>: left reference to {@link #mtasIndexDocIdFileName} minus the
* offset stored in the root node</li>
* <li><b>VLong</b>: right reference to {@link #mtasIndexDocIdFileName} minus
* the offset stored in the root node</li>
* <li><b>VInt</b>: number of objects on this node (always 1 for this tree)</li>
* <li><b>VLong</b>: reference to {@link #mtasDocFileName} minus offset</li>
* </ul>
* </li>
* <li><b>File {@link #mtasDocFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_DOC_EXTENSION}</b><br>
* For each document
* <ul>
* <li><b>VInt</b>: docId</li>
* <li><b>VLong</b>: reference to {@link #mtasIndexObjectIdFileName}</li>
* <li><b>VLong</b>: reference to {@link #mtasIndexObjectPositionFileName}</li>
* <li><b>VLong</b>: reference to {@link #mtasIndexObjectParentFileName}</li>
* <li><b>VLong</b>: reference first object, used as offset for tree index
* <li><b>VInt</b>: slope used in approximation reference objects index on id
* </li>
* <li><b>ZLong</b>: offset used in approximation reference objects index on id
* </li>
* <li><b>Byte</b>: flag indicating how corrections on the approximation
* references objects for the index on id are stored:
* {@link MtasCodecPostingsFormat#MTAS_STORAGE_BYTE},
* {@link MtasCodecPostingsFormat#MTAS_STORAGE_SHORT},
* {@link MtasCodecPostingsFormat#MTAS_STORAGE_INTEGER} or
* {@link MtasCodecPostingsFormat#MTAS_STORAGE_LONG}</li>
* <li><b>VInt</b>: number of objects</li>
* <li><b>VInt</b>: first position</li>
* <li><b>VInt</b>: last position</li>
* </ul>
* </li>
* <li><b>File {@link #mtasIndexObjectIdFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_ID_EXTENSION}
* </b><br>
* Provides for each mtasId the reference to {@link #mtasObjectFileName}. These
* references are grouped by document, sorted by mtasId, and because the
* mtasId's for each document will always start with 0 and are sequential
* without gaps, a reference can be computed if the position of the first
* reference for a document is known from {@link #mtasDocFileName}. The
* reference is approximated by the reference to the first object plus the
* mtasId times a slope. Only a correction to this approximation is stored.
* Structure of content:
* <ul>
* <li><b>Byte</b>/<b>Short</b>/<b>Int</b>/<b>Long</b>: correction reference to
* {@link #mtasObjectFileName}</li>
* </ul>
* </li>
* <li><b>File {@link #mtasIndexObjectPositionFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_POSITION_EXTENSION}
* </b><br>
* Contains for each document a tree structure {@link MtasTree} to search
* objects by position. Structure of content for each node:
* <ul>
* <li><b>VLong</b>: offset references to
* {@link #mtasIndexObjectPositionFileName}, only available in root node</li>
* <li><b>Byte</b>: flag, should be zero for this tree, only available in root
* node</li>
* <li><b>VInt</b>: left</li>
* <li><b>VInt</b>: right</li>
* <li><b>VInt</b>: max</li>
* <li><b>VLong</b>: left reference to {@link #mtasIndexObjectPositionFileName}
* minus the offset stored in the root node</li>
* <li><b>VLong</b>: right reference to {@link #mtasIndexObjectPositionFileName}
* minus the offset stored in the root node</li>
* <li><b>VInt</b>: number of objects on this node</li>
* <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>: set of the first reference to
* {@link #mtasObjectFileName} minus offset, the prefixId referring to the
* position of the prefix in {@link #mtasPrefixFileName} and the reference to
* {@link #mtasTermFileName} minus offset</li>
* <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>,...: for optional other sets of
* reference to {@link #mtasObjectFileName}, position of the prefix in
* {@link #mtasPrefixFileName} and the reference to {@link #mtasTermFileName};
* for the first item, the difference between this reference and the previous
* reference is stored</li>
* </ul>
* </li>
* <li><b>File {@link #mtasIndexObjectParentFileName} with extension
* {@value mtas.codec.MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_PARENT_EXTENSION}
* </b><br>
* Contains for each document a tree structure {@link MtasTree} to search
* objects by parent. Structure of content for each node:
* <ul>
* <li><b>VLong</b>: offset references to {@link #mtasIndexObjectParentFileName}
* , only available in root node</li>
* <li><b>Byte</b>: flag, for this tree equal to
* {@link mtas.codec.tree.MtasTree#SINGLE_POSITION_TREE} indicating a tree with
* exactly one point at each node, only available in root node</li>
* <li><b>VInt</b>: left</li>
* <li><b>VInt</b>: right</li>
* <li><b>VInt</b>: max</li>
* <li><b>VLong</b>: left reference to {@link #mtasIndexObjectParentFileName}
* minus the offset stored in the root node</li>
* <li><b>VLong</b>: right reference to {@link #mtasIndexObjectParentFileName}
* minus the offset stored in the root node</li>
* <li><b>VInt</b>: number of objects on this node</li>
* <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>: set of the first reference to
* {@link #mtasObjectFileName} minus offset, the prefixId referring to the
* position of the prefix in {@link #mtasPrefixFileName} and the reference to
* {@link #mtasTermFileName} minus offset</li>
* <li><b>VLong</b>,<b>VInt</b>,<b>VLong</b>,...: for optional other sets of
* reference to {@link #mtasObjectFileName}, position of the prefix in
* {@link #mtasPrefixFileName} and the reference to {@link #mtasTermFileName};
* for the first item, the difference between this reference and the previous
* reference is stored</li>
* </ul>
* </li>
* </ul>
* </li>
* </ul>
*
*/
public class MtasFieldsConsumer extends FieldsConsumer {
/** Logger for this class. */
private static final Log log = LogFactory.getLog(MtasFieldsConsumer.class);
/**
 * Consumer that receives the fields first in {@link #write(Fields)}, before
 * the mtas specific files are written.
 */
private FieldsConsumer delegateFieldsConsumer;
/**
 * Segment write state; supplies the directory, the IO context, the field
 * infos and the segment name, id and suffix used for all files.
 */
private SegmentWriteState state;
/**
 * Per field, the prefixes for which {@link #registerPrefixIntersection}
 * found a position that was already registered for the same prefix.
 * Re-created at the start of every write.
 */
private HashMap<String, HashSet<String>> intersectingPrefixes;
/**
 * Per field, the prefixes registered through
 * {@link #registerPrefixStatsSinglePositionValue} that have not been
 * registered with a range or set of positions.
 */
private HashMap<String, HashSet<String>> singlePositionPrefix;
/**
 * Per field, the prefixes registered through
 * {@link #registerPrefixStatsRangePositionValue} or
 * {@link #registerPrefixStatsSetPositionValue}.
 */
private HashMap<String, HashSet<String>> multiplePositionPrefix;
/**
 * Per field, the prefixes registered through
 * {@link #registerPrefixStatsSetPositionValue}.
 */
private HashMap<String, HashSet<String>> setPositionPrefix;
/**
 * Per field, maps each prefix to the file pointer in the prefix file at
 * which the prefix string was written (see {@link #registerPrefix}).
 */
private HashMap<String, HashMap<String, Long>> prefixReferenceIndex;
/**
 * Per field, maps each prefix to a sequential id, starting at 1, assigned
 * in order of first registration (see {@link #registerPrefix}).
 */
private HashMap<String, HashMap<String, Integer>> prefixIdIndex;
/**
 * Minimum position for the token statistics. NOTE(review): not referenced
 * in the visible part of this file - confirm where it is maintained.
 */
Integer tokenStatsMinPos;
/**
 * Maximum position for the token statistics. NOTE(review): not referenced
 * in the visible part of this file - confirm where it is maintained.
 */
Integer tokenStatsMaxPos;
/**
 * Number of tokens for the token statistics. NOTE(review): not referenced
 * in the visible part of this file - confirm where it is maintained.
 */
Integer tokenStatsNumber;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_TMP_FIELD_EXTENSION}.
 */
private String mtasTmpFieldFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_TMP_OBJECT_EXTENSION}.
 */
private String mtasTmpObjectFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_TMP_DOCS_EXTENSION}.
 */
private String mtasTmpDocsFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_TMP_DOC_EXTENSION}.
 */
private String mtasTmpDocFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_TMP_DOCS_CHAINED_EXTENSION}.
 */
private String mtasTmpDocsChainedFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_OBJECT_EXTENSION}.
 */
private String mtasObjectFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_TERM_EXTENSION}.
 */
private String mtasTermFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_FIELD_EXTENSION}.
 */
private String mtasIndexFieldFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_PREFIX_EXTENSION}.
 */
private String mtasPrefixFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_DOC_EXTENSION}.
 */
private String mtasDocFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_INDEX_DOC_ID_EXTENSION}.
 */
private String mtasIndexDocIdFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_ID_EXTENSION}.
 */
private String mtasIndexObjectIdFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_POSITION_EXTENSION}.
 */
private String mtasIndexObjectPositionFileName;
/**
 * Segment file name with extension
 * {@link MtasCodecPostingsFormat#MTAS_INDEX_OBJECT_PARENT_EXTENSION}.
 */
private String mtasIndexObjectParentFileName;
/** Name passed to the index header of every final mtas file. */
private String name;
/**
 * Name of the delegate postings format; written as a string directly after
 * the index header of every final mtas file.
 */
private String delegatePostingsFormatName;
/**
* Instantiates a new mtas fields consumer.
*
* @param fieldsConsumer
* the fields consumer
* @param state
* the state
* @param name
* the name
* @param delegatePostingsFormatName
* the delegate postings format name
*/
public MtasFieldsConsumer(FieldsConsumer fieldsConsumer,
SegmentWriteState state, String name, String delegatePostingsFormatName) {
this.delegateFieldsConsumer = fieldsConsumer;
this.state = state;
this.name = name;
this.delegatePostingsFormatName = delegatePostingsFormatName;
// temporary fileNames
mtasTmpFieldFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix,
MtasCodecPostingsFormat.MTAS_TMP_FIELD_EXTENSION);
mtasTmpObjectFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix,
MtasCodecPostingsFormat.MTAS_TMP_OBJECT_EXTENSION);
mtasTmpDocsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, MtasCodecPostingsFormat.MTAS_TMP_DOCS_EXTENSION);
mtasTmpDocFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, MtasCodecPostingsFormat.MTAS_TMP_DOC_EXTENSION);
mtasTmpDocsChainedFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix,
MtasCodecPostingsFormat.MTAS_TMP_DOCS_CHAINED_EXTENSION);
// fileNames
mtasObjectFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, MtasCodecPostingsFormat.MTAS_OBJECT_EXTENSION);
mtasTermFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, MtasCodecPostingsFormat.MTAS_TERM_EXTENSION);
mtasIndexFieldFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix,
MtasCodecPostingsFormat.MTAS_FIELD_EXTENSION);
mtasPrefixFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, MtasCodecPostingsFormat.MTAS_PREFIX_EXTENSION);
mtasDocFileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix, MtasCodecPostingsFormat.MTAS_DOC_EXTENSION);
mtasIndexDocIdFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix,
MtasCodecPostingsFormat.MTAS_INDEX_DOC_ID_EXTENSION);
mtasIndexObjectIdFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix,
MtasCodecPostingsFormat.MTAS_INDEX_OBJECT_ID_EXTENSION);
mtasIndexObjectPositionFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix,
MtasCodecPostingsFormat.MTAS_INDEX_OBJECT_POSITION_EXTENSION);
mtasIndexObjectParentFileName = IndexFileNames.segmentFileName(
state.segmentInfo.name, state.segmentSuffix,
MtasCodecPostingsFormat.MTAS_INDEX_OBJECT_PARENT_EXTENSION);
}
/**
 * Registers a prefix for a field the first time it is seen: stores the
 * current file pointer of the prefix output as its reference, assigns the
 * next sequential id (starting at 1) and writes the prefix string to the
 * output. A prefix that is already registered for the field is left
 * untouched and nothing is written.
 *
 * @param field
 *          the field the prefix belongs to
 * @param prefix
 *          the prefix to register
 * @param outPrefix
 *          the output the prefix string is written to
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
private void registerPrefix(String field, String prefix,
    IndexOutput outPrefix) throws IOException {
  // both per-field maps are always created together
  if (!prefixReferenceIndex.containsKey(field)) {
    prefixReferenceIndex.put(field, new HashMap<String, Long>());
    prefixIdIndex.put(field, new HashMap<String, Integer>());
  }
  final HashMap<String, Long> fieldReferences = prefixReferenceIndex
      .get(field);
  if (fieldReferences.containsKey(prefix)) {
    // known prefix, nothing to do
    return;
  }
  // the id is the rank of the prefix within this field, counting from 1
  final int nextId = fieldReferences.size() + 1;
  fieldReferences.put(prefix, outPrefix.getFilePointer());
  prefixIdIndex.get(field).put(prefix, nextId);
  outPrefix.writeString(prefix);
}
/**
 * Records the positions {@code start..end} (inclusive) for a prefix and
 * marks the prefix as intersecting for the field as soon as one of these
 * positions was already recorded for the same prefix. Once a prefix is
 * marked as intersecting, it is removed from the administration and later
 * calls for that field and prefix return immediately.
 *
 * @param field
 *          the field
 * @param prefix
 *          the prefix
 * @param start
 *          the first position (inclusive)
 * @param end
 *          the last position (inclusive)
 * @param docFieldAdministration
 *          per prefix, the positions recorded so far; presumably scoped to
 *          a single document and field - confirm against the caller
 */
private void registerPrefixIntersection(String field, String prefix,
    int start, int end,
    HashMap<String, HashSet<Integer>> docFieldAdministration) {
  if (intersectingPrefixes.containsKey(field)) {
    if (intersectingPrefixes.get(field).contains(prefix)) {
      // already known to intersect, no further bookkeeping needed
      return;
    }
  } else {
    intersectingPrefixes.put(field, new HashSet<String>());
  }

  final HashSet<Integer> occupiedPositions;
  if (docFieldAdministration.containsKey(prefix)) {
    occupiedPositions = docFieldAdministration.get(prefix);
    // look for a position that has been recorded before
    int candidate = start;
    while (candidate <= end) {
      if (occupiedPositions.contains(candidate)) {
        intersectingPrefixes.get(field).add(prefix);
        docFieldAdministration.remove(prefix);
        return;
      }
      candidate++;
    }
  } else {
    occupiedPositions = new HashSet<>();
    docFieldAdministration.put(prefix, occupiedPositions);
  }

  // no overlap found: record the new positions
  for (int position = start; position <= end; position++) {
    occupiedPositions.add(position);
  }
}
/**
 * Registers a prefix that occurred with a single position. The prefix is
 * added to the single position set of the field, unless it is already
 * known as a multiple position prefix for that field.
 *
 * @param field
 *          the field
 * @param prefix
 *          the prefix
 * @param outPrefix
 *          the output used to store a newly registered prefix
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
public void registerPrefixStatsSinglePositionValue(String field,
    String prefix, IndexOutput outPrefix) throws IOException {
  initPrefixStatsField(field);
  registerPrefix(field, prefix, outPrefix);
  final boolean knownAsMultiple = multiplePositionPrefix.get(field)
      .contains(prefix);
  if (knownAsMultiple) {
    return;
  }
  singlePositionPrefix.get(field).add(prefix);
}
/**
 * Registers a prefix that occurred with a range of positions. The prefix is
 * removed from the single position set of the field and added to the
 * multiple position set.
 *
 * @param field
 *          the field
 * @param prefix
 *          the prefix
 * @param outPrefix
 *          the output used to store a newly registered prefix
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
public void registerPrefixStatsRangePositionValue(String field, String prefix,
    IndexOutput outPrefix) throws IOException {
  initPrefixStatsField(field);
  registerPrefix(field, prefix, outPrefix);
  // a range means the prefix is no longer single position
  final HashSet<String> singles = singlePositionPrefix.get(field);
  final HashSet<String> multiples = multiplePositionPrefix.get(field);
  singles.remove(prefix);
  multiples.add(prefix);
}
/**
 * Registers a prefix that occurred with a set of positions. The prefix is
 * removed from the single position set of the field and added to both the
 * multiple position set and the set position set.
 *
 * @param field
 *          the field
 * @param prefix
 *          the prefix
 * @param outPrefix
 *          the output used to store a newly registered prefix
 * @throws IOException
 *           Signals that an I/O exception has occurred.
 */
public void registerPrefixStatsSetPositionValue(String field, String prefix,
    IndexOutput outPrefix) throws IOException {
  initPrefixStatsField(field);
  registerPrefix(field, prefix, outPrefix);
  // a set of positions also counts as multiple positions
  final HashSet<String> singles = singlePositionPrefix.get(field);
  final HashSet<String> multiples = multiplePositionPrefix.get(field);
  final HashSet<String> sets = setPositionPrefix.get(field);
  singles.remove(prefix);
  multiples.add(prefix);
  sets.add(prefix);
}
/**
 * Makes sure the single, multiple and set position statistics each contain
 * a (possibly empty) set of prefixes for the given field.
 *
 * @param field
 *          the field
 */
private void initPrefixStatsField(String field) {
  ensurePrefixStatsEntry(singlePositionPrefix, field);
  ensurePrefixStatsEntry(multiplePositionPrefix, field);
  ensurePrefixStatsEntry(setPositionPrefix, field);
}

/**
 * Adds an empty set of prefixes for the field when the statistics do not
 * contain the field yet.
 *
 * @param stats
 *          the statistics, keyed by field
 * @param field
 *          the field
 */
private void ensurePrefixStatsEntry(HashMap<String, HashSet<String>> stats,
    String field) {
  if (stats.containsKey(field)) {
    return;
  }
  stats.put(field, new HashSet<String>());
}
/**
 * Gets the single position prefixes of a field, joined with
 * {@link MtasToken#DELIMITER}.
 *
 * <p>
 * Returns an empty string when no prefix statistics were registered for the
 * field, consistent with
 * {@link #getPrefixStatsIntersectionPrefixAttribute(String)}. Previously this
 * case passed {@code null} to {@code String.join} and failed with a
 * {@link NullPointerException}.
 *
 * @param field
 *          the field
 * @return the joined single position prefixes, or an empty string if nothing
 *         is registered for the field
 */
public String getPrefixStatsSinglePositionPrefixAttribute(String field) {
  HashSet<String> prefixes = singlePositionPrefix.get(field);
  if (prefixes == null) {
    // nothing registered for this field
    return "";
  }
  return String.join(MtasToken.DELIMITER, prefixes);
}
/**
 * Gets the multiple position prefixes of a field, joined with
 * {@link MtasToken#DELIMITER}.
 *
 * <p>
 * Returns an empty string when no prefix statistics were registered for the
 * field, consistent with
 * {@link #getPrefixStatsIntersectionPrefixAttribute(String)}. Previously this
 * case passed {@code null} to {@code String.join} and failed with a
 * {@link NullPointerException}.
 *
 * @param field
 *          the field
 * @return the joined multiple position prefixes, or an empty string if
 *         nothing is registered for the field
 */
public String getPrefixStatsMultiplePositionPrefixAttribute(String field) {
  HashSet<String> prefixes = multiplePositionPrefix.get(field);
  if (prefixes == null) {
    // nothing registered for this field
    return "";
  }
  return String.join(MtasToken.DELIMITER, prefixes);
}
/**
 * Gets the set position prefixes of a field, joined with
 * {@link MtasToken#DELIMITER}.
 *
 * <p>
 * Returns an empty string when no prefix statistics were registered for the
 * field, consistent with
 * {@link #getPrefixStatsIntersectionPrefixAttribute(String)}. Previously this
 * case passed {@code null} to {@code String.join} and failed with a
 * {@link NullPointerException}.
 *
 * @param field
 *          the field
 * @return the joined set position prefixes, or an empty string if nothing is
 *         registered for the field
 */
public String getPrefixStatsSetPositionPrefixAttribute(String field) {
  HashSet<String> prefixes = setPositionPrefix.get(field);
  if (prefixes == null) {
    // nothing registered for this field
    return "";
  }
  return String.join(MtasToken.DELIMITER, prefixes);
}
/**
 * Gets the intersecting prefixes of a field, joined with
 * {@link MtasToken#DELIMITER}.
 *
 * @param field
 *          the field
 * @return the joined intersecting prefixes, or an empty string when no
 *         intersections are administrated for the field
 */
public String getPrefixStatsIntersectionPrefixAttribute(String field) {
  if (!intersectingPrefixes.containsKey(field)) {
    return "";
  }
  final HashSet<String> prefixes = intersectingPrefixes.get(field);
  return String.join(MtasToken.DELIMITER, prefixes);
}
/**
 * Merges the segments described by the merge state: checks the integrity of
 * every fields producer, combines the producers into a single mapped view in
 * which each reader gets its own slice of document ids, and writes that view
 * with {@link #write(Fields)}.
 *
 * @see org.apache.lucene.codecs.FieldsConsumer#merge(org.apache.lucene.index.
 *      MergeState)
 */
@Override
public void merge(MergeState mergeState) throws IOException {
  final List<Fields> producerFields = new ArrayList<>();
  final List<ReaderSlice> readerSlices = new ArrayList<>();
  final FieldsProducer[] producers = mergeState.fieldsProducers;
  final int[] maxDocs = mergeState.maxDocs;

  // each reader occupies a consecutive range of document ids
  int sliceStart = 0;
  for (int reader = 0; reader < producers.length; reader++) {
    final FieldsProducer producer = producers[reader];
    final int readerMaxDoc = maxDocs[reader];
    producer.checkIntegrity();
    readerSlices.add(new ReaderSlice(sliceStart, readerMaxDoc, reader));
    producerFields.add(producer);
    sliceStart += readerMaxDoc;
  }

  final MultiFields combined = new MultiFields(
      producerFields.toArray(Fields.EMPTY_ARRAY),
      readerSlices.toArray(ReaderSlice.EMPTY_ARRAY));
  final Fields mergedFields = new MappedMultiFields(mergeState, combined);
  write(mergedFields);
}
/**
 * Writes the fields with the delegate consumer first and afterwards creates
 * the mtas files for the same fields, using the field infos of the segment
 * write state.
 *
 * @see org.apache.lucene.codecs.FieldsConsumer#write(org.apache.lucene.index.
 *      Fields )
 */
@Override
public void write(Fields fields) throws IOException {
  // regular postings first
  this.delegateFieldsConsumer.write(fields);
  // then the mtas specific files
  this.write(this.state.fieldInfos, fields);
}
/**
* Write.
*
* @param fieldInfos
* the field infos
* @param fields
* the fields
*/
private void write(FieldInfos fieldInfos, Fields fields) {
IndexOutput outField;
IndexOutput outDoc;
IndexOutput outIndexDocId;
IndexOutput outIndexObjectId;
IndexOutput outIndexObjectPosition;
IndexOutput outIndexObjectParent;
IndexOutput outTerm;
IndexOutput outObject;
IndexOutput outPrefix;
IndexOutput outTmpDoc;
IndexOutput outTmpField;
HashSet<Closeable> closeables = new HashSet<>();
// prefix stats
intersectingPrefixes = new HashMap<>();
singlePositionPrefix = new HashMap<>();
multiplePositionPrefix = new HashMap<>();
setPositionPrefix = new HashMap<>();
prefixReferenceIndex = new HashMap<>();
prefixIdIndex = new HashMap<>();
// temporary temporary index in memory for doc
SortedMap<Integer, Long> memoryIndexTemporaryObject = new TreeMap<>();
// create (backwards) chained new temporary index docs
SortedMap<Integer, Long> memoryTmpDocChainList = new TreeMap<>();
// list of objectIds and references to objects
SortedMap<Integer, Long> memoryIndexDocList = new TreeMap<>();
try {
// create file tmpDoc
closeables.add(outTmpDoc = state.directory
.createOutput(mtasTmpDocFileName, state.context));
// create file tmpField
closeables.add(outTmpField = state.directory
.createOutput(mtasTmpFieldFileName, state.context));
// create file indexDoc
closeables.add(outDoc = state.directory.createOutput(mtasDocFileName,
state.context));
CodecUtil.writeIndexHeader(outDoc, name,
MtasCodecPostingsFormat.VERSION_CURRENT, state.segmentInfo.getId(),
state.segmentSuffix);
outDoc.writeString(delegatePostingsFormatName);
// create file indexDocId
closeables.add(outIndexDocId = state.directory
.createOutput(mtasIndexDocIdFileName, state.context));
CodecUtil.writeIndexHeader(outIndexDocId, name,
MtasCodecPostingsFormat.VERSION_CURRENT, state.segmentInfo.getId(),
state.segmentSuffix);
outIndexDocId.writeString(delegatePostingsFormatName);
// create file indexObjectId
closeables.add(outIndexObjectId = state.directory
.createOutput(mtasIndexObjectIdFileName, state.context));
CodecUtil.writeIndexHeader(outIndexObjectId, name,
MtasCodecPostingsFormat.VERSION_CURRENT, state.segmentInfo.getId(),
state.segmentSuffix);
outIndexObjectId.writeString(delegatePostingsFormatName);
// create file indexObjectPosition
closeables.add(outIndexObjectPosition = state.directory
.createOutput(mtasIndexObjectPositionFileName, state.context));
CodecUtil.writeIndexHeader(outIndexObjectPosition, name,
MtasCodecPostingsFormat.VERSION_CURRENT, state.segmentInfo.getId(),
state.segmentSuffix);
outIndexObjectPosition.writeString(delegatePostingsFormatName);
// create file indexObjectParent
closeables.add(outIndexObjectParent = state.directory
.createOutput(mtasIndexObjectParentFileName, state.context));
CodecUtil.writeIndexHeader(outIndexObjectParent, name,
MtasCodecPostingsFormat.VERSION_CURRENT, state.segmentInfo.getId(),
state.segmentSuffix);
outIndexObjectParent.writeString(delegatePostingsFormatName);
// create file term
closeables.add(outTerm = state.directory.createOutput(mtasTermFileName,
state.context));
CodecUtil.writeIndexHeader(outTerm, name,
MtasCodecPostingsFormat.VERSION_CURRENT, state.segmentInfo.getId(),
state.segmentSuffix);
outTerm.writeString(delegatePostingsFormatName);
// create file prefix
closeables.add(outPrefix = state.directory
.createOutput(mtasPrefixFileName, state.context));
CodecUtil.writeIndexHeader(outPrefix, name,
MtasCodecPostingsFormat.VERSION_CURRENT, state.segmentInfo.getId(),
state.segmentSuffix);
outPrefix.writeString(delegatePostingsFormatName);
// create file object
closeables.add(outObject = state.directory
.createOutput(mtasObjectFileName, state.context));
CodecUtil.writeIndexHeader(outObject, name,
MtasCodecPostingsFormat.VERSION_CURRENT, state.segmentInfo.getId(),
state.segmentSuffix);
outObject.writeString(delegatePostingsFormatName);
// For each field
for (String field : fields) {
Terms terms = fields.terms(field);
if (terms == null) {
continue;
} else {
// new temporary object storage for this field
IndexOutput outTmpObject = state.directory
.createOutput(mtasTmpObjectFileName, state.context);
closeables.add(outTmpObject);
// new temporary index docs for this field
IndexOutput outTmpDocs = state.directory
.createOutput(mtasTmpDocsFileName, state.context);
closeables.add(outTmpDocs);
// get fieldInfo
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
// get properties terms
boolean hasPositions = terms.hasPositions();
boolean hasFreqs = terms.hasFreqs();
boolean hasPayloads = fieldInfo.hasPayloads();
boolean hasOffsets = terms.hasOffsets();
// register references
Long smallestTermFilepointer = outTerm.getFilePointer();
Long smallestPrefixFilepointer = outPrefix.getFilePointer();
int termCounter = 0;
// only if freqs, positions and payload available
if (hasFreqs && hasPositions && hasPayloads) {
// compute flags
int flags = PostingsEnum.POSITIONS | PostingsEnum.PAYLOADS;
if (hasOffsets) {
flags = flags | PostingsEnum.OFFSETS;
}
// get terms
TermsEnum termsEnum = terms.iterator();
PostingsEnum postingsEnum = null;
// for each term in field
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
// store term and get ref
Long termRef = outTerm.getFilePointer();
outTerm.writeString(term.utf8ToString());
termCounter++;
// get postings
postingsEnum = termsEnum.postings(postingsEnum, flags);
// for each doc in field+term
while (true) {
Integer docId = postingsEnum.nextDoc();
if (docId.equals(DocIdSetIterator.NO_MORE_DOCS)) {
break;
}
int freq = postingsEnum.freq();
// temporary storage objects and temporary index in memory for
// doc
memoryIndexTemporaryObject.clear();
Long offsetFilePointerTmpObject = outTmpObject.getFilePointer();
for (int i = 0; i < freq; i++) {
Long currentFilePointerTmpObject = outTmpObject
.getFilePointer();
Integer mtasId;
int position = postingsEnum.nextPosition();
BytesRef payload = postingsEnum.getPayload();
if (hasOffsets) {
mtasId = createObjectAndRegisterPrefix(field, outTmpObject,
term, termRef, position, payload,
postingsEnum.startOffset(), postingsEnum.endOffset(),
outPrefix);
} else {
mtasId = createObjectAndRegisterPrefix(field, outTmpObject,
term, termRef, position, payload, outPrefix);
}
if (mtasId != null) {
assert !memoryIndexTemporaryObject.containsKey(
mtasId) : "mtasId should be unique in this selection";
memoryIndexTemporaryObject.put(mtasId,
currentFilePointerTmpObject);
}
} // end loop positions
// store temporary index for this doc
if (memoryIndexTemporaryObject.size() > 0) {
// docId for this part
outTmpDocs.writeVInt(docId);
// number of objects/tokens in this part
outTmpDocs.writeVInt(memoryIndexTemporaryObject.size());
// offset to be used for references
outTmpDocs.writeVLong(offsetFilePointerTmpObject);
// loop over tokens
for (Entry<Integer, Long> entry : memoryIndexTemporaryObject
.entrySet()) {
// mtasId object
outTmpDocs.writeVInt(entry.getKey());
// reference object
outTmpDocs.writeVLong(
(entry.getValue() - offsetFilePointerTmpObject));
}
}
// clean up
memoryIndexTemporaryObject.clear();
} // end loop docs
} // end loop terms
// set fieldInfo
fieldInfos.fieldInfo(field).putAttribute(
MtasCodecPostingsFormat.MTAS_FIELDINFO_ATTRIBUTE_PREFIX_SINGLE_POSITION,
getPrefixStatsSinglePositionPrefixAttribute(field));
fieldInfos.fieldInfo(field).putAttribute(
MtasCodecPostingsFormat.MTAS_FIELDINFO_ATTRIBUTE_PREFIX_MULTIPLE_POSITION,
getPrefixStatsMultiplePositionPrefixAttribute(field));
fieldInfos.fieldInfo(field).putAttribute(
MtasCodecPostingsFormat.MTAS_FIELDINFO_ATTRIBUTE_PREFIX_SET_POSITION,
getPrefixStatsSetPositionPrefixAttribute(field));
} // end processing field with freqs, positions and payload
// close temporary object storage and index docs
outTmpObject.close();
closeables.remove(outTmpObject);
outTmpDocs.close();
closeables.remove(outTmpDocs);
// create (backwards) chained new temporary index docs
IndexInput inTmpDocs = state.directory.openInput(mtasTmpDocsFileName,
state.context);
closeables.add(inTmpDocs);
IndexOutput outTmpDocsChained = state.directory
.createOutput(mtasTmpDocsChainedFileName, state.context);
closeables.add(outTmpDocsChained);
memoryTmpDocChainList.clear();
while (true) {
try {
Long currentFilepointer = outTmpDocsChained.getFilePointer();
// copy docId
int docId = inTmpDocs.readVInt();
outTmpDocsChained.writeVInt(docId);
// copy size
int size = inTmpDocs.readVInt();
outTmpDocsChained.writeVInt(size);
// offset references
outTmpDocsChained.writeVLong(inTmpDocs.readVLong());
for (int t = 0; t < size; t++) {
outTmpDocsChained.writeVInt(inTmpDocs.readVInt());
outTmpDocsChained.writeVLong(inTmpDocs.readVLong());
}
// set back reference to part with same docId
if (memoryTmpDocChainList.containsKey(docId)) {
// reference to previous
outTmpDocsChained.writeVLong(memoryTmpDocChainList.get(docId));
} else {
// self reference indicates end of chain
outTmpDocsChained.writeVLong(currentFilepointer);
}
// update temporary index in memory
memoryTmpDocChainList.put(docId, currentFilepointer);
} catch (IOException ex) {
log.debug(ex);
break;
}
}
outTmpDocsChained.close();
closeables.remove(outTmpDocsChained);
inTmpDocs.close();
closeables.remove(inTmpDocs);
state.directory.deleteFile(mtasTmpDocsFileName);
// set reference to tmpDoc in Field
if (memoryTmpDocChainList.size() > 0) {
outTmpField.writeString(field);
outTmpField.writeVLong(outTmpDoc.getFilePointer());
outTmpField.writeVInt(memoryTmpDocChainList.size());
outTmpField.writeVLong(smallestTermFilepointer);
outTmpField.writeVInt(termCounter);
outTmpField.writeVLong(smallestPrefixFilepointer);
outTmpField.writeVInt(prefixReferenceIndex.get(field).size());
// fill indexDoc
IndexInput inTmpDocsChained = state.directory
.openInput(mtasTmpDocsChainedFileName, state.context);
closeables.add(inTmpDocsChained);
IndexInput inTmpObject = state.directory
.openInput(mtasTmpObjectFileName, state.context);
closeables.add(inTmpObject);
for (Entry<Integer, Long> entry : memoryTmpDocChainList
.entrySet()) {
Integer docId = entry.getKey();
Long currentFilePointer;
Long newFilePointer;
// list of objectIds and references to objects
memoryIndexDocList.clear();
// construct final object + indexObjectId for docId
currentFilePointer = entry.getValue();
// collect objects for document
tokenStatsMinPos = null;
tokenStatsMaxPos = null;
tokenStatsNumber = 0;
while (true) {
inTmpDocsChained.seek(currentFilePointer);
Integer docIdPart = inTmpDocsChained.readVInt();
assert docIdPart.equals(
docId) : "conflicting docId in reference to temporaryIndexDocsChained";
// number of objects/tokens in part
int size = inTmpDocsChained.readVInt();
long offsetFilePointerTmpObject = inTmpDocsChained.readVLong();
assert size > 0 : "number of objects/tokens in part cannot be "
+ size;
for (int t = 0; t < size; t++) {
int mtasId = inTmpDocsChained.readVInt();
Long tmpObjectRef = inTmpDocsChained.readVLong()
+ offsetFilePointerTmpObject;
assert !memoryIndexDocList.containsKey(
mtasId) : "mtasId should be unique in this selection";
// initially, store ref to tmpObject
memoryIndexDocList.put(mtasId, tmpObjectRef);
}
// reference to next part
newFilePointer = inTmpDocsChained.readVLong();
if (newFilePointer.equals(currentFilePointer)) {
break; // end of chained parts
} else {
currentFilePointer = newFilePointer;
}
}
// now create new objects, sorted by mtasId
Long smallestObjectFilepointer = outObject.getFilePointer();
for (Entry<Integer, Long> objectEntry : memoryIndexDocList
.entrySet()) {
int mtasId = objectEntry.getKey();
Long tmpObjectRef = objectEntry.getValue();
Long objectRef = outObject.getFilePointer();
copyObjectAndUpdateStats(mtasId, inTmpObject, tmpObjectRef,
outObject);
// update with new ref
memoryIndexDocList.put(mtasId, objectRef);
}
// check mtasIds properties
assert memoryIndexDocList.firstKey()
.equals(0) : "first mtasId should not be "
+ memoryIndexDocList.firstKey();
assert (1 + memoryIndexDocList.lastKey()
- memoryIndexDocList.firstKey()) == memoryIndexDocList
.size() : "missing mtasId";
assert tokenStatsNumber.equals(memoryIndexDocList
.size()) : "incorrect number of items in tokenStats";
// store item in tmpDoc
outTmpDoc.writeVInt(docId);
outTmpDoc.writeVLong(outIndexObjectId.getFilePointer());
int mtasId = 0;
// compute linear approximation (least squares method, integer
// constants)
long tmpN = memoryIndexDocList.size();
long tmpSumY = 0;
long tmpSumXY = 0;
long tmpSumX = 0;
long tmpSumXX = 0;
for (Entry<Integer, Long> objectEntry : memoryIndexDocList
.entrySet()) {
assert objectEntry.getKey()
.equals(mtasId) : "unexpected mtasId";
tmpSumY += objectEntry.getValue();
tmpSumX += mtasId;
tmpSumXY += mtasId * objectEntry.getValue();
tmpSumXX += mtasId * mtasId;
mtasId++;
}
int objectRefApproxQuotient;
if(tmpN>1) {
objectRefApproxQuotient= (int) (((tmpN * tmpSumXY)
- (tmpSumX * tmpSumY))
/ ((tmpN * tmpSumXX) - (tmpSumX * tmpSumX)));
} else {
objectRefApproxQuotient = 0;
}
long objectRefApproxOffset = (tmpSumY
- objectRefApproxQuotient * tmpSumX) / tmpN;
Long objectRefApproxCorrection;
long maxAbsObjectRefApproxCorrection = 0;
// compute maximum correction
mtasId = 0;
for (Entry<Integer, Long> objectEntry : memoryIndexDocList
.entrySet()) {
objectRefApproxCorrection = (objectEntry.getValue()
- (objectRefApproxOffset
+ (mtasId * objectRefApproxQuotient)));
maxAbsObjectRefApproxCorrection = Math.max(
maxAbsObjectRefApproxCorrection,
Math.abs(objectRefApproxCorrection));
mtasId++;
}
byte storageFlags;
if (maxAbsObjectRefApproxCorrection <= Long
.valueOf(Byte.MAX_VALUE)) {
storageFlags = MtasCodecPostingsFormat.MTAS_STORAGE_BYTE;
} else if (maxAbsObjectRefApproxCorrection <= Long
.valueOf(Short.MAX_VALUE)) {
storageFlags = MtasCodecPostingsFormat.MTAS_STORAGE_SHORT;
} else if (maxAbsObjectRefApproxCorrection <= Long
.valueOf(Integer.MAX_VALUE)) {
storageFlags = MtasCodecPostingsFormat.MTAS_STORAGE_INTEGER;
} else {
storageFlags = MtasCodecPostingsFormat.MTAS_STORAGE_LONG;
}
// update indexObjectId with correction on approximated ref
// (assume
// can be stored as int)
mtasId = 0;
for (Entry<Integer, Long> objectEntry : memoryIndexDocList
.entrySet()) {
objectRefApproxCorrection = (objectEntry.getValue()
- (objectRefApproxOffset
+ (mtasId * objectRefApproxQuotient)));
if (storageFlags == MtasCodecPostingsFormat.MTAS_STORAGE_BYTE) {
outIndexObjectId
.writeByte(objectRefApproxCorrection.byteValue());
} else if (storageFlags == MtasCodecPostingsFormat.MTAS_STORAGE_SHORT) {
outIndexObjectId
.writeShort(objectRefApproxCorrection.shortValue());
} else if (storageFlags == MtasCodecPostingsFormat.MTAS_STORAGE_INTEGER) {
outIndexObjectId
.writeInt(objectRefApproxCorrection.intValue());
} else {
outIndexObjectId.writeLong(objectRefApproxCorrection);
}
mtasId++;
}
outTmpDoc.writeVLong(smallestObjectFilepointer);
outTmpDoc.writeVInt(objectRefApproxQuotient);
outTmpDoc.writeZLong(objectRefApproxOffset);
outTmpDoc.writeByte(storageFlags);
outTmpDoc.writeVInt(tokenStatsNumber);
outTmpDoc.writeVInt(tokenStatsMinPos);
outTmpDoc.writeVInt(tokenStatsMaxPos);
// clean up
memoryIndexDocList.clear();
} // end loop over docs
inTmpDocsChained.close();
closeables.remove(inTmpDocsChained);
inTmpObject.close();
closeables.remove(inTmpObject);
}
// clean up
memoryTmpDocChainList.clear();
// remove temporary files
state.directory.deleteFile(mtasTmpObjectFileName);
state.directory.deleteFile(mtasTmpDocsChainedFileName);
// store references for field
} // end processing field
} // end loop fields
// close temporary index doc
outTmpDoc.close();
closeables.remove(outTmpDoc);
// close indexField, indexObjectId and object
CodecUtil.writeFooter(outTmpField);
outTmpField.close();
closeables.remove(outTmpField);
CodecUtil.writeFooter(outIndexObjectId);
outIndexObjectId.close();
closeables.remove(outIndexObjectId);
CodecUtil.writeFooter(outObject);
outObject.close();
closeables.remove(outObject);
CodecUtil.writeFooter(outTerm);
outTerm.close();
closeables.remove(outTerm);
CodecUtil.writeFooter(outPrefix);
outPrefix.close();
closeables.remove(outPrefix);
// create final doc, fill indexObjectPosition, indexObjectParent and
// indexTermPrefixPosition, create final field
IndexInput inTmpField = state.directory.openInput(mtasTmpFieldFileName,
state.context);
closeables.add(inTmpField);
IndexInput inTmpDoc = state.directory.openInput(mtasTmpDocFileName,
state.context);
closeables.add(inTmpDoc);
IndexInput inObjectId = state.directory
.openInput(mtasIndexObjectIdFileName, state.context);
closeables.add(inObjectId);
IndexInput inObject = state.directory.openInput(mtasObjectFileName,
state.context);
closeables.add(inObject);
IndexInput inTerm = state.directory.openInput(mtasTermFileName,
state.context);
closeables.add(inTerm);
closeables.add(outField = state.directory
.createOutput(mtasIndexFieldFileName, state.context));
CodecUtil.writeIndexHeader(outField, name,
MtasCodecPostingsFormat.VERSION_CURRENT, state.segmentInfo.getId(),
state.segmentSuffix);
outField.writeString(delegatePostingsFormatName);
boolean doWrite = true;
do {
try {
// read from tmpField
String field = inTmpField.readString();
long fpTmpDoc = inTmpField.readVLong();
int numberDocs = inTmpField.readVInt();
long fpTerm = inTmpField.readVLong();
int numberTerms = inTmpField.readVInt();
long fpPrefix = inTmpField.readVLong();
int numberPrefixes = inTmpField.readVInt();
inTmpDoc.seek(fpTmpDoc);
long fpFirstDoc = outDoc.getFilePointer();
// get prefixId index
HashMap<String, Integer> prefixIdIndexField = prefixIdIndex
.get(field);
// construct MtasRBTree for indexDocId
MtasRBTree mtasDocIdTree = new MtasRBTree(true, false);
for (int docCounter = 0; docCounter < numberDocs; docCounter++) {
// get info from tmpDoc
int docId = inTmpDoc.readVInt();
// filePointer indexObjectId
Long fpIndexObjectId = inTmpDoc.readVLong();
// filePointer indexObjectPosition (unknown)
Long fpIndexObjectPosition;
// filePointer indexObjectParent (unknown)
Long fpIndexObjectParent;
// constants for approximation object references for this document
long smallestObjectFilepointer = inTmpDoc.readVLong();
int objectRefApproxQuotient = inTmpDoc.readVInt();
long objectRefApproxOffset = inTmpDoc.readZLong();
byte storageFlags = inTmpDoc.readByte();
// number objects/tokens
int size = inTmpDoc.readVInt();
// construct MtasRBTree
MtasRBTree mtasPositionTree = new MtasRBTree(false, true);
MtasRBTree mtasParentTree = new MtasRBTree(false, true);
inObjectId.seek(fpIndexObjectId);
long refCorrection;
long ref;
HashMap<String, HashSet<Integer>> docFieldAdministration = new HashMap<>();
for (int mtasId = 0; mtasId < size; mtasId++) {
if (storageFlags == MtasCodecPostingsFormat.MTAS_STORAGE_BYTE) {
refCorrection = inObjectId.readByte();
} else if (storageFlags == MtasCodecPostingsFormat.MTAS_STORAGE_SHORT) {
refCorrection = inObjectId.readShort();
} else if (storageFlags == MtasCodecPostingsFormat.MTAS_STORAGE_INTEGER) {
refCorrection = inObjectId.readInt();
} else {
refCorrection = inObjectId.readLong();
}
ref = objectRefApproxOffset + mtasId * objectRefApproxQuotient
+ refCorrection;
MtasTokenString token = MtasCodecPostingsFormat.getToken(inObject,
inTerm, ref);
String prefix = token.getPrefix();
registerPrefixIntersection(field, prefix,
token.getPositionStart(), token.getPositionEnd(),
docFieldAdministration);
int prefixId = prefixIdIndexField.containsKey(prefix)
? prefixIdIndexField.get(prefix) : 0;
token.setPrefixId(prefixId);
assert token.getId().equals(mtasId) : "unexpected mtasId "
+ mtasId;
mtasPositionTree.addPositionAndObjectFromToken(token);
mtasParentTree.addParentFromToken(token);
}
// store mtasPositionTree and mtasParentTree
fpIndexObjectPosition = storeTree(mtasPositionTree,
outIndexObjectPosition, smallestObjectFilepointer);
fpIndexObjectParent = storeTree(mtasParentTree,
outIndexObjectParent, smallestObjectFilepointer);
long fpDoc = outDoc.getFilePointer();
// create indexDoc with updated fpIndexObjectPosition from tmpDoc
outDoc.writeVInt(docId); // docId
// reference indexObjectId
outDoc.writeVLong(fpIndexObjectId);
// reference indexObjectPosition
outDoc.writeVLong(fpIndexObjectPosition);
// reference indexObjectParent
outDoc.writeVLong(fpIndexObjectParent);
// variables approximation and storage references object
outDoc.writeVLong(smallestObjectFilepointer);
outDoc.writeVInt(objectRefApproxQuotient);
outDoc.writeZLong(objectRefApproxOffset);
outDoc.writeByte(storageFlags);
// number of objects
outDoc.writeVInt(size);
// minPosition
outDoc.writeVInt(inTmpDoc.readVInt());
// maxPosition
outDoc.writeVInt(inTmpDoc.readVInt());
// add to tree for indexDocId
mtasDocIdTree.addIdFromDoc(docId, fpDoc);
}
long fpIndexDocId = storeTree(mtasDocIdTree, outIndexDocId,
fpFirstDoc);
// store in indexField
outField.writeString(field);
outField.writeVLong(fpFirstDoc);
outField.writeVLong(fpIndexDocId);
outField.writeVInt(numberDocs);
outField.writeVLong(fpTerm);
outField.writeVInt(numberTerms);
outField.writeVLong(fpPrefix);
outField.writeVInt(numberPrefixes);
// register intersection
fieldInfos.fieldInfo(field).putAttribute(
MtasCodecPostingsFormat.MTAS_FIELDINFO_ATTRIBUTE_PREFIX_INTERSECTION,
getPrefixStatsIntersectionPrefixAttribute(field));
} catch (EOFException e) {
log.debug(e);
doWrite = false;
}
// end loop over fields
} while (doWrite);
inTerm.close();
closeables.remove(inTerm);
inObject.close();
closeables.remove(inObject);
inObjectId.close();
closeables.remove(inObjectId);
inTmpDoc.close();
closeables.remove(inTmpDoc);
inTmpField.close();
closeables.remove(inTmpField);
// remove temporary files
state.directory.deleteFile(mtasTmpDocFileName);
state.directory.deleteFile(mtasTmpFieldFileName);
// close indexDoc, indexObjectPosition and indexObjectParent
CodecUtil.writeFooter(outDoc);
outDoc.close();
closeables.remove(outDoc);
CodecUtil.writeFooter(outIndexObjectPosition);
outIndexObjectPosition.close();
closeables.remove(outIndexObjectPosition);
CodecUtil.writeFooter(outIndexObjectParent);
outIndexObjectParent.close();
closeables.remove(outIndexObjectParent);
CodecUtil.writeFooter(outIndexDocId);
outIndexDocId.close();
closeables.remove(outIndexDocId);
CodecUtil.writeFooter(outField);
outField.close();
closeables.remove(outField);
} catch (IOException e) {
// ignore, can happen when merging segment already written by
// delegateFieldsConsumer
log.error(e);
} finally {
IOUtils.closeWhileHandlingException(closeables);
try {
state.directory.deleteFile(mtasTmpDocsFileName);
} catch (IOException e) {
log.debug(e);
}
try {
state.directory.deleteFile(mtasTmpDocFileName);
} catch (IOException e) {
log.debug(e);
}
try {
state.directory.deleteFile(mtasTmpFieldFileName);
} catch (IOException e) {
log.debug(e);
}
}
}
/**
 * Creates the object and registers its prefix for a posting that carries no
 * offset information.
 *
 * <p>Convenience overload: forwards every argument to the full variant and
 * supplies {@code null} for both the start and the end offset.
 *
 * @param field the field
 * @param out the output the object is written to
 * @param term the term
 * @param termRef the term reference stored with the object
 * @param startPosition the start position
 * @param payload the payload
 * @param outPrefix the prefix output
 * @return the value returned by the full variant
 * @throws IOException Signals that an I/O exception has occurred.
 */
private Integer createObjectAndRegisterPrefix(String field, IndexOutput out,
    BytesRef term, Long termRef, int startPosition, BytesRef payload,
    IndexOutput outPrefix) throws IOException {
  // no offsets available for this posting
  final Integer absentStartOffset = null;
  final Integer absentEndOffset = null;
  return createObjectAndRegisterPrefix(field, out, term, termRef,
      startPosition, payload, absentStartOffset, absentEndOffset, outPrefix);
}
/**
 * Decodes the payload of a single posting and, when the payload carries an
 * mtasId, serializes the described object to {@code out}.
 *
 * <p>Layout written per object: mtasId, flags, optional parent id, position
 * (a single value, start plus length for a range, or a count followed by
 * delta encoded values for a set), optional offset and optional real offset
 * (each as start plus length), optional payload (length plus bytes) and
 * finally the term reference. While the flags are computed, the prefix
 * statistics registration matching the position type is invoked.
 *
 * <p>If the payload holds no offset of its own and {@code startOffset} is
 * not {@code null}, the offset is built from {@code startOffset} and
 * {@code endOffset}.
 *
 * @param field the field
 * @param out the output the object is written to
 * @param term the term
 * @param termRef the term reference stored at the end of the object
 * @param startPosition the start position handed to the payload decoder
 * @param payload the payload, may be {@code null}
 * @param startOffset the start offset, may be {@code null}
 * @param endOffset the end offset
 * @param outPrefix the prefix output
 * @return the mtasId found in the payload, or {@code null} when there is no
 *         payload or the payload holds no mtasId
 * @throws IOException when the object has no position, or wrapping any
 *         exception raised while decoding or writing
 */
private Integer createObjectAndRegisterPrefix(String field, IndexOutput out,
    BytesRef term, Long termRef, int startPosition, BytesRef payload,
    Integer startOffset, Integer endOffset, IndexOutput outPrefix)
    throws IOException {
  try {
    String prefix = MtasToken.getPrefixFromValue(term.utf8ToString());
    if (payload == null) {
      // nothing to decode, so no object
      return null;
    }
    MtasPayloadDecoder decoder = new MtasPayloadDecoder();
    byte[] rawPayload = Arrays.copyOfRange(payload.bytes, payload.offset,
        payload.offset + payload.length);
    decoder.init(startPosition, rawPayload);
    Integer mtasId = decoder.getMtasId();
    Integer parentId = decoder.getMtasParentId();
    byte[] tokenPayload = decoder.getMtasPayload();
    MtasPosition tokenPosition = decoder.getMtasPosition();
    MtasOffset tokenOffset = decoder.getMtasOffset();
    if (tokenOffset == null && startOffset != null) {
      // fall back on the offsets supplied by the caller
      tokenOffset = new MtasOffset(startOffset, endOffset);
    }
    MtasOffset tokenRealOffset = decoder.getMtasRealOffset();
    if (mtasId == null) {
      // not really an mtas object
      return null;
    }
    if (tokenPosition == null) {
      throw new IOException("no position");
    }
    // compute flags, registering the prefix for the detected position type
    int objectFlags = 0;
    if (tokenPosition.checkType(MtasPosition.POSITION_RANGE)) {
      objectFlags |= MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_RANGE;
      registerPrefixStatsRangePositionValue(field, prefix, outPrefix);
    } else if (tokenPosition.checkType(MtasPosition.POSITION_SET)) {
      objectFlags |= MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_SET;
      registerPrefixStatsSetPositionValue(field, prefix, outPrefix);
    } else {
      registerPrefixStatsSinglePositionValue(field, prefix, outPrefix);
    }
    if (parentId != null) {
      objectFlags |= MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PARENT;
    }
    if (tokenOffset != null) {
      objectFlags |= MtasCodecPostingsFormat.MTAS_OBJECT_HAS_OFFSET;
    }
    if (tokenRealOffset != null) {
      objectFlags |= MtasCodecPostingsFormat.MTAS_OBJECT_HAS_REALOFFSET;
    }
    if (tokenPayload != null) {
      objectFlags |= MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PAYLOAD;
    }
    // decide once which optional parts are present
    boolean withParent = (objectFlags
        & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PARENT) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PARENT;
    boolean withRange = (objectFlags
        & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_RANGE) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_RANGE;
    boolean withSet = (objectFlags
        & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_SET) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_SET;
    boolean withOffset = (objectFlags
        & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_OFFSET) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_OFFSET;
    boolean withRealOffset = (objectFlags
        & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_REALOFFSET) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_REALOFFSET;
    boolean withPayload = (objectFlags
        & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PAYLOAD) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PAYLOAD;
    // create object
    out.writeVInt(mtasId);
    out.writeVInt(objectFlags);
    if (withParent) {
      out.writeVInt(parentId);
    }
    if (withRange) {
      // range: start followed by (end - start)
      int rangeStart = tokenPosition.getStart();
      out.writeVInt(rangeStart);
      out.writeVInt(tokenPosition.getEnd() - rangeStart);
    } else if (withSet) {
      // set: count followed by the difference to the previous position
      int[] positions = tokenPosition.getPositions();
      out.writeVInt(positions.length);
      int previous = 0;
      for (int i = 0; i < positions.length; i++) {
        out.writeVInt(positions[i] - previous);
        previous = positions[i];
      }
    } else {
      out.writeVInt(tokenPosition.getStart());
    }
    if (withOffset) {
      int offsetStart = tokenOffset.getStart();
      out.writeVInt(offsetStart);
      out.writeVInt(tokenOffset.getEnd() - offsetStart);
    }
    if (withRealOffset) {
      int realOffsetStart = tokenRealOffset.getStart();
      out.writeVInt(realOffsetStart);
      out.writeVInt(tokenRealOffset.getEnd() - realOffsetStart);
    }
    if (withPayload) {
      if (tokenPayload == null) {
        out.writeVInt(0);
      } else {
        out.writeVInt(tokenPayload.length);
        out.writeBytes(tokenPayload, tokenPayload.length);
      }
    }
    out.writeVLong(termRef);
    return mtasId;
  } catch (Exception e) {
    log.error(e);
    throw new IOException(e);
  }
}
/**
 * Stores a complete tree.
 *
 * <p>Closes the tree, reads its single point and additional information
 * settings, and hands the node returned by {@code tree.close()} to the
 * recursive variant with a {@code null} node reference offset, which that
 * variant treats as the root call.
 *
 * @param tree the tree
 * @param out the output the tree is written to
 * @param refApproxOffset the offset subtracted from every stored object
 *        reference
 * @return the value returned by the recursive variant
 * @throws IOException Signals that an I/O exception has occurred.
 */
private Long storeTree(MtasTree<?> tree, IndexOutput out,
    long refApproxOffset) throws IOException {
  MtasTreeNode<?> closedTreeNode = tree.close();
  boolean singlePoint = tree.isSinglePoint();
  boolean additionalInformation = tree.isStorePrefixAndTermRef();
  return storeTree(closedTreeNode, singlePoint, additionalInformation, out,
      null, refApproxOffset);
}
/**
 * Recursively stores a node and everything below it.
 *
 * <p>Both children are written before the node itself (left first, then
 * right). The call that receives a {@code null} node reference offset is
 * treated as the root: it takes the current file pointer of {@code out} as
 * base offset and writes that offset plus a flag byte in front of its node
 * data. Each node then stores left, right and max, the references to its
 * children relative to the base offset (a missing child is stored as a
 * reference to the node itself), and its ids sorted and delta encoded
 * relative to {@code refApproxOffset}.
 *
 * @param node the node, may be {@code null}
 * @param isSinglePoint whether every node must hold exactly one id; in that
 *        case the number of ids is not written
 * @param storeAdditionalInformation whether the additional id and
 *        additional reference of every id are written as well
 * @param out the output the tree is written to
 * @param nodeRefApproxOffset the base offset for references between nodes,
 *        or {@code null} for the root call
 * @param refApproxOffset the offset subtracted from every stored object
 *        reference
 * @return the file pointer of {@code out} taken after the children were
 *         written and before this node was written, or {@code null} when
 *         {@code node} is {@code null}
 * @throws IOException when writing fails or a single point tree holds a
 *         node without exactly one id
 */
private Long storeTree(MtasTreeNode<?> node, boolean isSinglePoint,
    boolean storeAdditionalInformation, IndexOutput out,
    Long nodeRefApproxOffset, long refApproxOffset) throws IOException {
  if (node == null) {
    return null;
  }
  // a missing base offset marks the root call
  final boolean atRoot = (nodeRefApproxOffset == null);
  final long baseOffset = atRoot ? out.getFilePointer() : nodeRefApproxOffset;
  // children first, so their positions are known when this node is written
  Long storedLeft = null;
  if (node.leftChild != null) {
    storedLeft = storeTree(node.leftChild, isSinglePoint,
        storeAdditionalInformation, out, baseOffset, refApproxOffset);
  }
  Long storedRight = null;
  if (node.rightChild != null) {
    storedRight = storeTree(node.rightChild, isSinglePoint,
        storeAdditionalInformation, out, baseOffset, refApproxOffset);
  }
  final long nodePosition = out.getFilePointer();
  // a missing child is represented by a reference to this node itself
  final long leftRef = (node.leftChild == null) ? nodePosition : storedLeft;
  final long rightRef = (node.rightChild == null) ? nodePosition
      : storedRight;
  if (atRoot) {
    assert baseOffset >= 0 : "nodeRefApproxOffset < 0 : " + baseOffset;
    out.writeVLong(baseOffset);
    byte flag = 0;
    if (isSinglePoint) {
      flag |= MtasTree.SINGLE_POSITION_TREE;
    }
    if (storeAdditionalInformation) {
      flag |= MtasTree.STORE_ADDITIONAL_ID;
    }
    out.writeByte(flag);
  }
  assert node.left >= 0 : "node.left < 0 : " + node.left;
  out.writeVInt(node.left);
  assert node.right >= 0 : "node.right < 0 : " + node.right;
  out.writeVInt(node.right);
  assert node.max >= 0 : "node.max < 0 : " + node.max;
  out.writeVInt(node.max);
  assert leftRef >= baseOffset : "fpIndexObjectPositionLeftChild<nodeRefApproxOffset : "
      + leftRef + " and " + baseOffset;
  out.writeVLong(leftRef - baseOffset);
  assert rightRef >= baseOffset : "fpIndexObjectPositionRightChild<nodeRefApproxOffset"
      + rightRef + " and " + baseOffset;
  out.writeVLong(rightRef - baseOffset);
  if (!isSinglePoint) {
    out.writeVInt(node.ids.size());
  }
  // sort refs
  List<MtasTreeNodeId> sortedIds = new ArrayList<>(node.ids.values());
  Collections.sort(sortedIds);
  if (isSinglePoint && (sortedIds.size() != 1)) {
    throw new IOException("singlePoint tree, but missing single point...");
  }
  long previousCorrected = 0;
  for (int index = 0; index < sortedIds.size(); index++) {
    MtasTreeNodeId nodeId = sortedIds.get(index);
    int itemNumber = index + 1;
    long corrected = nodeId.ref - refApproxOffset;
    assert corrected >= previousCorrected : "objectRefCorrected<objectRefCorrectedPrevious : "
        + corrected + " and " + previousCorrected;
    // store the difference with the previous reference
    out.writeVLong(corrected - previousCorrected);
    previousCorrected = corrected;
    if (storeAdditionalInformation) {
      assert nodeId.additionalId >= 0 : "nodeId.additionalId < 0 for item "
          + itemNumber + " : " + nodeId.additionalId;
      out.writeVInt(nodeId.additionalId);
      assert nodeId.additionalRef >= 0 : "nodeId.additionalRef < 0 for item "
          + itemNumber + " : " + nodeId.additionalRef;
      out.writeVLong(nodeId.additionalRef);
    }
  }
  return nodePosition;
}
/**
 * Registers one more token in the token statistics and widens the recorded
 * position interval so that it includes {@code min} and {@code max}.
 *
 * <p>A boundary that is still {@code null} is initialized with the supplied
 * value.
 *
 * @param min the smallest position of the token
 * @param max the largest position of the token
 */
private void tokenStatsAdd(int min, int max) {
  tokenStatsNumber++;
  tokenStatsMinPos = (tokenStatsMinPos == null) ? min
      : Math.min(tokenStatsMinPos, min);
  tokenStatsMaxPos = (tokenStatsMaxPos == null) ? max
      : Math.max(tokenStatsMaxPos, max);
}
/**
 * Copies one object from {@code in} to {@code out} and updates the token
 * statistics with the positions of that object.
 *
 * <p>The object is read in the layout produced by
 * {@code createObjectAndRegisterPrefix}: mtasId, flags, optional parent id,
 * position, optional offset, optional real offset, optional payload and the
 * term reference. The bytes are copied unchanged; only the position part is
 * interpreted, to feed {@link #tokenStatsAdd(int, int)}.
 *
 * @param id the mtasId expected at {@code inRef} (checked by assertion)
 * @param in the input holding the object
 * @param inRef the position of the object in {@code in}
 * @param out the output the object is copied to
 * @throws IOException Signals that an I/O exception has occurred.
 */
private void copyObjectAndUpdateStats(int id, IndexInput in, Long inRef,
    IndexOutput out) throws IOException {
  // read
  in.seek(inRef);
  int mtasId = in.readVInt();
  assert id == mtasId : "wrong id detected while copying object";
  int objectFlags = in.readVInt();
  out.writeVInt(mtasId);
  out.writeVInt(objectFlags);
  if ((objectFlags
      & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PARENT) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PARENT) {
    out.writeVInt(in.readVInt());
  }
  if ((objectFlags
      & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_RANGE) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_RANGE) {
    // a range is stored as start followed by (end - start), so the largest
    // position has to be reconstructed before it is added to the statistics
    int startPos = in.readVInt();
    int rangeLength = in.readVInt();
    out.writeVInt(startPos);
    out.writeVInt(rangeLength);
    tokenStatsAdd(startPos, startPos + rangeLength);
  } else if ((objectFlags
      & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_SET) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_POSITION_SET) {
    // a set is stored as count followed by differences to the previous
    // position; accumulate them to recover the real positions
    int size = in.readVInt();
    out.writeVInt(size);
    SortedSet<Integer> list = new TreeSet<>();
    int previousPosition = 0;
    for (int t = 0; t < size; t++) {
      int pos = in.readVInt();
      out.writeVInt(pos);
      previousPosition = (pos + previousPosition);
      list.add(previousPosition);
    }
    assert list.size() == size : "duplicate positions in set are not allowed";
    tokenStatsAdd(list.first(), list.last());
  } else {
    // single position
    int pos = in.readVInt();
    out.writeVInt(pos);
    tokenStatsAdd(pos, pos);
  }
  if ((objectFlags
      & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_OFFSET) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_OFFSET) {
    out.writeVInt(in.readVInt());
    out.writeVInt(in.readVInt());
  }
  if ((objectFlags
      & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_REALOFFSET) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_REALOFFSET) {
    out.writeVInt(in.readVInt());
    out.writeVInt(in.readVInt());
  }
  if ((objectFlags
      & MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PAYLOAD) == MtasCodecPostingsFormat.MTAS_OBJECT_HAS_PAYLOAD) {
    int length = in.readVInt();
    out.writeVInt(length);
    byte[] payload = new byte[length];
    in.readBytes(payload, 0, length);
    out.writeBytes(payload, payload.length);
  }
  // term reference
  out.writeVLong(in.readVLong());
}
/**
 * Closes this consumer by closing the delegate fields consumer.
 *
 * @throws IOException when closing the delegate fails
 * @see org.apache.lucene.codecs.FieldsConsumer#close()
 */
@Override
public void close() throws IOException {
// the only resource released here is the delegate
delegateFieldsConsumer.close();
}
}