7 #ifndef DOCUMENTSWRITER_H 8 #define DOCUMENTSWRITER_H 257 int32_t
flush(
bool _closeDocStore);
318 String
toMB(int64_t v);
375 virtual bool testPoint(
const String& name);
397 virtual ByteArray newBuffer(int32_t size);
414 virtual void finish() = 0;
415 virtual void abort() = 0;
416 virtual int64_t sizeInBytes() = 0;
464 virtual void finish();
465 virtual void abort();
466 virtual int64_t sizeInBytes();
513 virtual ByteArray getByteBlock(
bool trackAllocations);
Definition: ByteBlockPool.h:54
static const int32_t MAX_THREAD_STATE
Max # ThreadState instances; if there are more threads than this they share ThreadStates.
Definition: DocumentsWriter.h:69
void abort()
Called if we hit an exception at a bad time (when updating the index files) and must discard all curr...
InfoStreamPtr infoStream
Definition: DocumentsWriter.h:367
int32_t numDocsInStore
Definition: DocumentsWriter.h:160
void doAfterFlush()
Reset after a flush.
static const int32_t CHAR_BLOCK_MASK
Definition: DocumentsWriter.h:143
void initSegmentName(bool onlyDocStore)
void setMaxBufferedDocs(int32_t count)
Set max buffered docs, which means we will flush by doc count instead of by RAM usage.
String closeDocStore()
Closes the currently open doc stores and returns the doc store segment name. This returns null if there ...
void setMaxBufferedDeleteTerms(int32_t maxBufferedDeleteTerms)
void bytesUsed(int64_t numBytes)
static const int32_t BYTE_BLOCK_NOT_MASK
Definition: DocumentsWriter.h:138
String maxTermPrefix
Definition: DocumentsWriter.h:371
void addDeleteTerm(const TermPtr &term, int32_t docCount)
void waitReady(const DocumentsWriterThreadStatePtr &state)
void setFlushedDocCount(int32_t n)
int32_t maxBufferedDeleteTerms
The max number of delete terms that can be buffered before they must be flushed to disk...
Definition: DocumentsWriter.h:85
static IndexingChainPtr getDefaultIndexingChain()
void setInfoStream(const InfoStreamPtr &infoStream)
If non-null, various details of indexing are printed here.
HashSet< String > _closedFiles
Definition: DocumentsWriter.h:172
bool flushPending
Definition: DocumentsWriter.h:162
boost::shared_ptr< InfoStream > InfoStreamPtr
Definition: LuceneTypes.h:532
AnalyzerPtr analyzer
Definition: DocumentsWriter.h:365
BufferedDeletesPtr deletesInRAM
Deletes done after the last flush; these are discarded on abort.
Definition: DocumentsWriter.h:79
boost::shared_ptr< Term > TermPtr
Definition: LuceneTypes.h:233
This class accepts multiple added documents and directly writes a single segment file. It does this more efficiently than creating a single segment per document (with DocumentWriter) and doing standard merges on those segments.
Definition: DocumentsWriter.h:54
boost::shared_ptr< Analyzer > AnalyzerPtr
Definition: LuceneTypes.h:20
static const int32_t OBJECT_HEADER_BYTES
Coarse estimates used to measure RAM usage of buffered deletes.
Definition: DocumentsWriter.h:113
WaitQueuePtr waitQueue
Definition: DocumentsWriter.h:174
int32_t numWaiting
Definition: DocumentsWriter.h:483
int32_t getNumBufferedDeleteTerms()
DocumentsWriterThreadStatePtr getThreadState(const DocumentPtr &doc, const TermPtr &delTerm)
Returns a free (idle) ThreadState that may be used for indexing this one document. This call also pauses if a flush is pending. If delTerm is non-null then we buffer this deleted term after the thread state has been acquired.
static const int32_t CHAR_NUM_BYTE
Definition: DocumentsWriter.h:116
boost::shared_ptr< OneMerge > OneMergePtr
Definition: LuceneTypes.h:192
int64_t numBytesUsed
Definition: DocumentsWriter.h:181
void createCompoundFile(const String &segment)
Build compound file for the segment we just flushed.
boost::shared_ptr< BufferedDeletes > BufferedDeletesPtr
Definition: LuceneTypes.h:87
IndexingChainPtr indexingChain
Definition: DocumentsWriter.h:157
int32_t maxFieldLength
Definition: DocumentsWriter.h:366
static const int32_t PER_DOC_BLOCK_SIZE
Definition: DocumentsWriter.h:152
void recycleIntBlocks(Collection< IntArray > blocks, int32_t start, int32_t end)
bool checkDeleteTerm(const TermPtr &term)
int32_t maxBufferedDocs
Flush at this number of docs. If ramBufferSize is non-zero we will flush by RAM usage instead...
Definition: DocumentsWriter.h:97
DocumentPtr doc
Definition: DocumentsWriter.h:370
static const int32_t INT_BLOCK_SIZE
Definition: DocumentsWriter.h:149
static const int32_t CHAR_BLOCK_SIZE
Definition: DocumentsWriter.h:142
HashSet< String > closedFiles()
int32_t docStoreOffset
Definition: DocumentsWriter.h:63
void setSimilarity(const SimilarityPtr &similarity)
bool pauseAllThreads()
Returns true if an abort is in progress.
boost::shared_ptr< IndexWriter > IndexWriterPtr
Definition: LuceneTypes.h:160
void bytesAllocated(int64_t numBytes)
double getRAMBufferSizeMB()
IntArray getIntBlock(bool trackAllocations)
HashSet< String > openFiles()
Returns Collection of files in use by this instance, including any flushed segments.
RAMFile buffer for DocWriters.
Definition: DocumentsWriter.h:381
int32_t nextDocID
Definition: DocumentsWriter.h:65
static const int32_t INT_BLOCK_SHIFT
Initial chunk size of the shared int[] blocks used to store postings data.
Definition: DocumentsWriter.h:148
virtual ~DocumentsWriter()
int32_t maxFieldLength
Definition: DocumentsWriter.h:166
boost::shared_ptr< WaitQueue > WaitQueuePtr
Definition: LuceneTypes.h:265
int64_t freeTrigger
If we've allocated 5% over our RAM budget, we then free down to 95%.
Definition: DocumentsWriter.h:93
DocFieldProcessorPtr docFieldProcessor
Definition: DocumentsWriter.h:76
boost::shared_ptr< SkipDocWriter > SkipDocWriterPtr
Definition: LuceneTypes.h:226
SimilarityPtr similarity
Definition: DocumentsWriter.h:167
int64_t waitingBytes
Definition: DocumentsWriter.h:484
boost::shared_ptr< DocumentsWriterThreadState > DocumentsWriterThreadStatePtr
Definition: LuceneTypes.h:124
boost::weak_ptr< DocumentsWriter > DocumentsWriterWeakPtr
Definition: LuceneTypes.h:123
HashSet< String > getFlushedFiles()
BufferedDeletesPtr deletesFlushed
Deletes done before the last flush; these are still kept on abort.
Definition: DocumentsWriter.h:82
void addDeleteDocID(int32_t docID)
Buffer a specific docID for deletion. Currently only used when we hit an exception when adding a docum...
static const int32_t INT_BLOCK_MASK
Definition: DocumentsWriter.h:150
bool bufferDeleteQuery(const QueryPtr &query)
boost::weak_ptr< IndexWriter > IndexWriterWeakPtr
Definition: LuceneTypes.h:160
static const int32_t BYTES_PER_DEL_QUERY
Rough logic: HashMap has an array[Entry] with varying load factor (say 2 * POINTER). Entry is object with Query key, Integer val, int hash, Entry next (OBJ_HEADER + 3*POINTER + INT). Query we often undercount (say 24 bytes). Integer is OBJ_HEADER + INT.
Definition: DocumentsWriter.h:132
boost::shared_ptr< SegmentInfos > SegmentInfosPtr
Definition: LuceneTypes.h:210
boost::shared_ptr< DocFieldProcessor > DocFieldProcessorPtr
Definition: LuceneTypes.h:115
boost::shared_ptr< DocumentsWriter > DocumentsWriterPtr
Definition: LuceneTypes.h:123
int32_t nextWriteDocID
Definition: DocumentsWriter.h:481
void addDeleteQuery(const QueryPtr &query, int32_t docID)
void finishDocument(const DocumentsWriterThreadStatePtr &perThread, const DocWriterPtr &docWriter)
Does the synchronized work to finish/flush the inverted document.
HashSet< String > abortedFiles()
boost::shared_ptr< Document > DocumentPtr
Definition: LuceneTypes.h:74
String segment
Definition: DocumentsWriter.h:158
boost::shared_ptr< Query > QueryPtr
Definition: LuceneTypes.h:420
static const int32_t MAX_TERM_LENGTH
Definition: DocumentsWriter.h:145
bool updateDocument(const TermPtr &t, const DocumentPtr &doc, const AnalyzerPtr &analyzer)
void message(const String &message)
void setRAMBufferSizeMB(double mb)
Set how much RAM we can use before flushing.
Definition: DocumentsWriter.h:356
bool closed
Definition: DocumentsWriter.h:102
bool hasProx()
Returns true if any of the fields in the current buffered docs have omitTermFreqAndPositions==false.
boost::shared_ptr< ByteBlockAllocator > ByteBlockAllocatorPtr
Definition: LuceneTypes.h:88
File used as buffer in RAMDirectory.
Definition: RAMFile.h:15
DocConsumerPtr consumer
Definition: DocumentsWriter.h:169
int32_t docID
Definition: DocumentsWriter.h:411
boost::shared_ptr< IndexReader > IndexReaderPtr
Definition: LuceneTypes.h:157
boost::shared_ptr< Directory > DirectoryPtr
Definition: LuceneTypes.h:489
int32_t getNumDocsInRAM()
Returns how many docs are currently buffered in RAM.
int32_t blockSize
Definition: DocumentsWriter.h:508
static const int32_t CHAR_BLOCK_SHIFT
Initial chunk size of the shared char[] blocks used to store term text.
Definition: DocumentsWriter.h:141
ByteBlockAllocatorPtr perDocAllocator
Definition: DocumentsWriter.h:178
Base class for all Lucene classes.
Definition: LuceneObject.h:31
boost::shared_ptr< DocConsumer > DocConsumerPtr
Definition: LuceneTypes.h:106
Collection< ByteArray > freeByteBlocks
Definition: DocumentsWriter.h:509
friend class WaitQueue
Definition: DocumentsWriter.h:353
int64_t ramBufferSize
How much RAM we can use before flushing. This is 0 if we are flushing by doc count instead...
Definition: DocumentsWriter.h:88
int32_t getMaxBufferedDeleteTerms()
String getDocStoreSegment()
Returns the current doc store segment we are writing to.
Collection< DocumentsWriterThreadStatePtr > threadStates
Definition: DocumentsWriter.h:70
DocumentsWriter(const DirectoryPtr &directory, const IndexWriterPtr &writer, const IndexingChainPtr &indexingChain)
void removeOpenFile(const String &name)
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
static const int32_t BYTES_PER_DEL_TERM
Rough logic: HashMap has an array[Entry] with varying load factor (say 2 * POINTER). Entry is object with Term key, BufferedDeletes.Num val, int hash, Entry next (OBJ_HEADER + 3*POINTER + INT). Term is object with String field and String text (OBJ_HEADER + 2*POINTER). We don't count Term's field since it's interned. Term's text is String (OBJ_HEADER + 4*INT + POINTER + OBJ_HEADER + string.length*CHAR). BufferedDeletes.num is OBJ_HEADER + INT.
Definition: DocumentsWriter.h:123
Collection< CharArray > freeCharBlocks
Definition: DocumentsWriter.h:109
Definition: AbstractAllTermDocs.h:12
bool bufferDeleteTerms(Collection< TermPtr > terms)
bool setFlushPending()
Set flushPending if it is not already set and returns whether it was set. This is used by IndexWriter...
TermPtr lastDeleteTerm
Definition: DocumentsWriter.h:184
int32_t flushedDocCount
How many docs already flushed to index.
Definition: DocumentsWriter.h:100
Collection< IntArray > freeIntBlocks
Definition: DocumentsWriter.h:108
bool bufferDeleteTerm(const TermPtr &term)
Definition: DocumentsWriter.h:457
HashSet< String > _openFiles
Definition: DocumentsWriter.h:171
Definition: DocumentsWriter.h:469
int64_t freeLevel
Definition: DocumentsWriter.h:94
SimilarityPtr similarity
Definition: DocumentsWriter.h:368
void recycleCharBlocks(Collection< CharArray > blocks, int32_t numBlocks)
Definition: DocumentsWriter.h:497
void initFlushState(bool onlyDocStore)
static const int32_t BYTES_PER_DEL_DOCID
Rough logic: del docIDs are List<Integer>. Say list allocates ~2X size (2*POINTER). Integer is OBJ_HEADER + int.
Definition: DocumentsWriter.h:127
The IndexingChain must define the getChain(DocumentsWriter) method which returns the DocConsumer that...
Definition: DocumentsWriter.h:423
String docStoreSegment
Definition: DocumentsWriter.h:59
static const int32_t BYTE_BLOCK_MASK
Definition: DocumentsWriter.h:137
int32_t numDocsInRAM
Definition: DocumentsWriter.h:66
MapTermNum getBufferedDeleteTerms()
static const int32_t BYTE_BLOCK_SIZE
Definition: DocumentsWriter.h:136
ByteBlockAllocatorPtr byteBlockAllocator
Definition: DocumentsWriter.h:177
void updateFlushedDocCount(int32_t n)
int32_t flush(bool _closeDocStore)
Flush all pending docs to a new segment.
bool bufferDeleteQueries(Collection< QueryPtr > queries)
void setMaxFieldLength(int32_t maxFieldLength)
int32_t getFlushedDocCount()
int32_t docID
Definition: DocumentsWriter.h:369
int32_t getMaxBufferedDocs()
int64_t waitQueueResumeBytes
Definition: DocumentsWriter.h:90
SkipDocWriterPtr skipDocWriter
Definition: DocumentsWriter.h:175
boost::shared_ptr< IndexingChain > IndexingChainPtr
Definition: LuceneTypes.h:156
PerDocBufferPtr newPerDocBuffer()
Create and return a new DocWriterBuffer.
static const int32_t POINTER_NUM_BYTE
Definition: DocumentsWriter.h:114
bool aborting
Definition: DocumentsWriter.h:74
Consumer returns this on each doc. This holds any state that must be flushed synchronized "in docID o...
Definition: DocumentsWriter.h:402
bool addDocument(const DocumentPtr &doc, const AnalyzerPtr &analyzer)
Returns true if the caller (IndexWriter) should now flush.
boost::shared_ptr< Similarity > SimilarityPtr
Definition: LuceneTypes.h:435
boost::shared_ptr< PerDocBuffer > PerDocBufferPtr
Definition: LuceneTypes.h:199
int64_t numBytesAlloc
Definition: DocumentsWriter.h:180
int32_t pauseThreads
Definition: DocumentsWriter.h:73
SegmentWriteStatePtr flushState
Definition: DocumentsWriter.h:106
void addOpenFile(const String &name)
String getSegment()
Get current segment name we are writing.
bool timeToFlushDeletes()
DirectoryPtr directory
Definition: DocumentsWriter.h:156
void remapDeletes(const SegmentInfosPtr &infos, Collection< Collection< int32_t > > docMaps, Collection< int32_t > delCounts, const OneMergePtr &merge, int32_t mergeDocCount)
Called whenever a merge has completed and the merged segments had deletions.
virtual void initialize()
Called directly after instantiation to create objects that depend on this object being fully construc...
bool applyDeletes(const SegmentInfosPtr &infos)
This is the current indexing chain: DocConsumer / DocConsumerPerThread –> code: DocFieldProcessor / ...
Definition: DocumentsWriter.h:447
MapThreadDocumentsWriterThreadState threadBindings
Definition: DocumentsWriter.h:71
int32_t getDocStoreOffset()
Returns the doc offset into the shared doc store for the current buffered docs.
boost::shared_ptr< SegmentWriteState > SegmentWriteStatePtr
Definition: LuceneTypes.h:222
void balanceRAM()
We have four pools of RAM: Postings, byte blocks (holds freq/prox posting data), char blocks (holds c...
int64_t waitQueuePauseBytes
Definition: DocumentsWriter.h:89
boost::shared_ptr< DocWriter > DocWriterPtr
Definition: LuceneTypes.h:125
static const int32_t INT_NUM_BYTE
Definition: DocumentsWriter.h:115
HashSet< String > _abortedFiles
List of files that were written before last abort()
Definition: DocumentsWriter.h:105
Collection< DocWriterPtr > waiting
Definition: DocumentsWriter.h:480
int32_t nextWriteLoc
Definition: DocumentsWriter.h:482
InfoStreamPtr infoStream
Definition: DocumentsWriter.h:165
bool bufferIsFull
Definition: DocumentsWriter.h:163
static const int32_t BYTE_BLOCK_SHIFT
Initial chunk size of the shared byte[] blocks used to store postings data.
Definition: DocumentsWriter.h:135