public class BKDWriter
extends java.lang.Object
implements java.io.Closeable
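Recursively builds a block KD-tree to assign all incoming points in N-dim space to smaller
and smaller N-dim rectangles (cells) until the number of points in a given
rectangle is <= maxPointsInLeafNode. The tree is
fully balanced, which means the leaf nodes will have between 50% and 100% of
the requested maxPointsInLeafNode. Values that fall exactly
on a cell boundary may be in either cell.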
The number of dimensions can be 1 to 255, but every byte[] value is fixed length.
See the paper "Bkd-Tree: A Dynamic Scalable kd-Tree" (Procopiuc, Agarwal, Arge, Vitter) for details.
This consumes heap during writing: it allocates a LongBitSet(numPoints),
and then uses up to the specified maxMBSortInHeap heap space for writing.
NOTE: This can write at most Integer.MAX_VALUE * maxPointsInLeafNode
total points.
Modifier and Type | Class and Description |
---|---|
private static class |
BKDWriter.BKDMergeQueue |
private static class |
BKDWriter.MergeReader |
private static class |
BKDWriter.PathSlice
Sliced reference to points in an OfflineSorter.ByteSequencesWriter file.
|
Modifier and Type | Field and Description |
---|---|
protected int |
bytesPerDim
How many bytes each value in each dimension takes.
|
private int |
bytesPerDoc
How many bytes each doc takes in the fixed-width offline format
|
static java.lang.String |
CODEC_NAME |
(package private) int[] |
commonPrefixLengths |
static float |
DEFAULT_MAX_MB_SORT_IN_HEAP
Default maximum heap to use, before spilling to (slower) disk
|
static int |
DEFAULT_MAX_POINTS_IN_LEAF_NODE
Default maximum number of points in each leaf block
|
protected FixedBitSet |
docsSeen |
private HeapPointWriter |
heapPointWriter |
protected boolean |
longOrds
true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes)
|
static int |
MAX_DIMS
Maximum number of dimensions
|
private int |
maxDoc |
(package private) double |
maxMBSortInHeap |
protected byte[] |
maxPackedValue
Maximum per-dim values, packed
|
protected int |
maxPointsInLeafNode |
private int |
maxPointsSortInHeap |
protected byte[] |
minPackedValue
Minimum per-dim values, packed
|
protected int |
numDims
How many dimensions we are indexing
|
private OfflinePointWriter |
offlinePointWriter |
protected OfflineSorter.BufferSize |
offlineSorterBufferMB
How much heap OfflineSorter is allowed to use
|
protected int |
offlineSorterMaxTempFiles
How many temp files OfflineSorter is allowed to use
|
protected int |
packedBytesLength
numDims * bytesPerDim
|
protected long |
pointCount |
(package private) byte[] |
scratch1 |
(package private) byte[] |
scratch2 |
(package private) BytesRef |
scratchBytesRef |
(package private) byte[] |
scratchDiff |
protected boolean |
singleValuePerDoc
True if every document has at most one value.
|
(package private) TrackingDirectoryWrapper |
tempDir |
(package private) java.lang.String |
tempFileNamePrefix |
private IndexOutput |
tempInput |
private long |
totalPointCount
An upper bound on how many points the caller will add (includes deletions)
|
static int |
VERSION_CURRENT |
static int |
VERSION_START |
Modifier | Constructor and Description |
---|---|
|
BKDWriter(int maxDoc,
Directory tempDir,
java.lang.String tempFileNamePrefix,
int numDims,
int bytesPerDim,
int maxPointsInLeafNode,
double maxMBSortInHeap,
long totalPointCount,
boolean singleValuePerDoc) |
protected |
BKDWriter(int maxDoc,
Directory tempDir,
java.lang.String tempFileNamePrefix,
int numDims,
int bytesPerDim,
int maxPointsInLeafNode,
double maxMBSortInHeap,
long totalPointCount,
boolean singleValuePerDoc,
boolean longOrds,
long offlineSorterBufferMB,
int offlineSorterMaxTempFiles) |
Modifier and Type | Method and Description |
---|---|
void |
add(byte[] packedValue,
int docID) |
private void |
build(int nodeID,
int leafNodeOffset,
BKDWriter.PathSlice[] slices,
LongBitSet ordBitSet,
IndexOutput out,
byte[] minPackedValue,
byte[] maxPackedValue,
byte[] splitPackedValues,
long[] leafBlockFPs,
java.util.List<java.io.Closeable> toCloseHeroically)
The array (sized numDims) of PathSlice describes the cell we have currently recursed to.
|
private void |
checkMaxLeafNodeCount(int numLeaves) |
void |
close() |
long |
finish(IndexOutput out)
Writes the BKD tree to the provided IndexOutput and returns the file offset where the index was written. |
long |
getPointCount()
How many points have been added so far
|
(package private) PointWriter |
getPointWriter(long count,
java.lang.String desc) |
private byte[] |
markRightTree(long rightCount,
int splitDim,
BKDWriter.PathSlice source,
LongBitSet ordBitSet)
Marks bits for the ords (points) that belong in the right sub tree (those docs that have values >= the splitValue).
|
long |
merge(IndexOutput out,
java.util.List<MergeState.DocMap> docMaps,
java.util.List<BKDReader> readers,
java.util.List<java.lang.Integer> docIDBases)
More efficient bulk-add for incoming BKDReaders. |
private void |
rotateToTree(int nodeID,
int offset,
int count,
byte[] index,
java.util.List<byte[]> leafBlockStartValues) |
private PointWriter |
sort(int dim) |
private void |
sortHeapPointWriter(HeapPointWriter writer,
int dim)
Sort the heap writer by the specified dim
|
private void |
spillToOffline()
If the current segment has too many points then we spill over to temp files / offline sort.
|
protected int |
split(byte[] minPackedValue,
byte[] maxPackedValue) |
private BKDWriter.PathSlice |
switchToHeap(BKDWriter.PathSlice source,
java.util.List<java.io.Closeable> toCloseHeroically)
Pull a partition back into heap once the point count is low enough while recursing.
|
private boolean |
valueInBounds(BytesRef packedValue,
byte[] minPackedValue,
byte[] maxPackedValue)
Called only in assert
|
private boolean |
valueInOrder(long ord,
byte[] lastPackedValue,
byte[] packedValue,
int packedValueOffset) |
private void |
verifyChecksum(java.lang.Throwable priorException,
PointWriter writer)
Called on exception, to check whether the checksum is also corrupt in this source, and add that
information (checksum matched or didn't) as a suppressed exception.
|
static void |
verifyParams(int numDims,
int maxPointsInLeafNode,
double maxMBSortInHeap,
long totalPointCount) |
protected void |
writeCommonPrefixes(IndexOutput out,
int[] commonPrefixes,
byte[] packedValue) |
protected void |
writeIndex(IndexOutput out,
long[] leafBlockFPs,
byte[] splitPackedValues)
Subclass can change how it writes the index.
|
protected void |
writeLeafBlockDocs(IndexOutput out,
int[] docIDs,
int start,
int count) |
protected void |
writeLeafBlockPackedValue(IndexOutput out,
int[] commonPrefixLengths,
byte[] bytes,
int offset) |
public static final java.lang.String CODEC_NAME
public static final int VERSION_START
public static final int VERSION_CURRENT
private final int bytesPerDoc
public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE
public static final float DEFAULT_MAX_MB_SORT_IN_HEAP
public static final int MAX_DIMS
protected final int numDims
protected final int bytesPerDim
protected final int packedBytesLength
final TrackingDirectoryWrapper tempDir
final java.lang.String tempFileNamePrefix
final double maxMBSortInHeap
final byte[] scratchDiff
final byte[] scratch1
final byte[] scratch2
final BytesRef scratchBytesRef
final int[] commonPrefixLengths
protected final FixedBitSet docsSeen
private OfflinePointWriter offlinePointWriter
private HeapPointWriter heapPointWriter
private IndexOutput tempInput
protected final int maxPointsInLeafNode
private final int maxPointsSortInHeap
protected final byte[] minPackedValue
protected final byte[] maxPackedValue
protected long pointCount
protected final boolean longOrds
private final long totalPointCount
protected final boolean singleValuePerDoc
protected final OfflineSorter.BufferSize offlineSorterBufferMB
protected final int offlineSorterMaxTempFiles
private final int maxDoc
public BKDWriter(int maxDoc, Directory tempDir, java.lang.String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc) throws java.io.IOException
java.io.IOException
protected BKDWriter(int maxDoc, Directory tempDir, java.lang.String tempFileNamePrefix, int numDims, int bytesPerDim, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc, boolean longOrds, long offlineSorterBufferMB, int offlineSorterMaxTempFiles) throws java.io.IOException
java.io.IOException
public static void verifyParams(int numDims, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount)
private void spillToOffline() throws java.io.IOException
java.io.IOException
public void add(byte[] packedValue, int docID) throws java.io.IOException
java.io.IOException
public long getPointCount()
public long merge(IndexOutput out, java.util.List<MergeState.DocMap> docMaps, java.util.List<BKDReader> readers, java.util.List<java.lang.Integer> docIDBases) throws java.io.IOException
More efficient bulk-add for incoming BKDReaders. This does a merge sort of the already
sorted values and currently only works when numDims==1. This returns -1 if all documents containing
dimensional values were deleted.
Throws: java.io.IOException
private void rotateToTree(int nodeID, int offset, int count, byte[] index, java.util.List<byte[]> leafBlockStartValues)
private void sortHeapPointWriter(HeapPointWriter writer, int dim)
private PointWriter sort(int dim) throws java.io.IOException
java.io.IOException
private void checkMaxLeafNodeCount(int numLeaves)
public long finish(IndexOutput out) throws java.io.IOException
Writes the BKD tree to the provided IndexOutput and returns the file offset where the index was written.
Throws: java.io.IOException
protected void writeIndex(IndexOutput out, long[] leafBlockFPs, byte[] splitPackedValues) throws java.io.IOException
java.io.IOException
protected void writeLeafBlockDocs(IndexOutput out, int[] docIDs, int start, int count) throws java.io.IOException
java.io.IOException
protected void writeLeafBlockPackedValue(IndexOutput out, int[] commonPrefixLengths, byte[] bytes, int offset) throws java.io.IOException
java.io.IOException
protected void writeCommonPrefixes(IndexOutput out, int[] commonPrefixes, byte[] packedValue) throws java.io.IOException
java.io.IOException
public void close() throws java.io.IOException
Specified by: close in interface java.io.Closeable
Specified by: close in interface java.lang.AutoCloseable
Throws: java.io.IOException
private void verifyChecksum(java.lang.Throwable priorException, PointWriter writer) throws java.io.IOException
java.io.IOException
private byte[] markRightTree(long rightCount, int splitDim, BKDWriter.PathSlice source, LongBitSet ordBitSet) throws java.io.IOException
java.io.IOException
private boolean valueInBounds(BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue)
protected int split(byte[] minPackedValue, byte[] maxPackedValue)
private BKDWriter.PathSlice switchToHeap(BKDWriter.PathSlice source, java.util.List<java.io.Closeable> toCloseHeroically) throws java.io.IOException
java.io.IOException
private void build(int nodeID, int leafNodeOffset, BKDWriter.PathSlice[] slices, LongBitSet ordBitSet, IndexOutput out, byte[] minPackedValue, byte[] maxPackedValue, byte[] splitPackedValues, long[] leafBlockFPs, java.util.List<java.io.Closeable> toCloseHeroically) throws java.io.IOException
java.io.IOException
private boolean valueInOrder(long ord, byte[] lastPackedValue, byte[] packedValue, int packedValueOffset)
PointWriter getPointWriter(long count, java.lang.String desc) throws java.io.IOException
java.io.IOException