|
Public Methods |
| IndriIndex (size_t memorySize=INDRI_DEFAULT_MEMORY_SIZE, float queryProportion=INDRI_DEFAULT_QUERY_PROPORTION) |
| ~IndriIndex () |
void | setName (const std::string &prefix) |
| sets the name for this index
|
DOCID_T | addDocument (struct ParsedDocument *document) |
DOCID_T | addDocument (const char *documentName, const greedy_vector< char * > &words, const greedy_vector< TagExtent > &tagExtents) |
DocInfoList * | docInfoList (TERMID_T termID) const |
| doc entries in a term index. See also: DocList
|
DocPositionInfoList * | docPositionInfoList (TERMID_T termID) |
| doc entries in a term index with positions
|
indri::index::DocListFrequencyIterator * | docFrequencyInfoList (TERMID_T termID) |
| doc entries in a term index without positions
|
TermInfoList * | termInfoList (DOCID_T docID) const |
| word entries in a document index (bag of words). See also: TermList
|
TermInfoList * | termInfoListSeq (DOCID_T docID) const |
| word entries in a document index (sequence of words). See also: TermList
|
indri::index::TermListBuilder * | termPositionList (DOCID_T docID) |
| internal IndriIndex term list representation
|
indri::index::FieldListIterator * | fieldPositionListIterator (int fieldID) |
| field list
|
|
bool | open (const std::string &indexName) |
| Open previously created Index with given prefix.
|
bool | open (const char *indexName) |
| Open previously created Index with given prefix.
|
bool | openRead (const std::string &indexName) |
| Open previously created Index with given prefix in read only mode.
|
bool | create (const std::string &indexName) |
| Create a new index with the given prefix.
|
bool | create (const std::string &indexName, const std::vector< FieldDescription > &fields) |
| Create a new index with the given prefix and tag set.
|
void | close () |
| Close the index.
|
|
TERMID_T | term (const TERM_T &word) const |
| Convert a term spelling to a termID.
|
const TERM_T | term (TERMID_T termID) const |
| Convert a termID to its spelling.
|
DOCID_T | document (const EXDOCID_T &docIDStr) const |
| Convert a document's spelling (external ID string) to its docID.
|
const EXDOCID_T | document (DOCID_T docID) const |
| Convert a docID to its spelling.
|
const char * | field (int fieldID) |
| Convert a fieldID to its name.
|
int | field (const char *fieldName) |
| Convert a field name to its fieldID.
|
int | field (const std::string &fieldName) |
| Convert a field name to its fieldID.
|
|
COUNT_T | docCount () const |
| Total count (i.e., number) of documents in collection.
|
COUNT_T | termCountUnique () const |
| Total count of unique terms in collection.
|
INT64 | termCount (TERMID_T termID) const |
| Total count of occurrences of a term in the collection.
|
INT64 | termCount () const |
| Total count of occurrences of all terms in the collection.
|
INT64 | fieldTermCount (int fieldID, TERMID_T termID) const |
| Total counts of a term in a field.
|
INT64 | fieldTermCount (int fieldID) const |
| Total counts of all terms in a field.
|
INT64 | fieldDocCount (int fieldID) const |
| Total count of documents that contain a given field.
|
INT64 | fieldDocCount (int fieldID, TERMID_T termID) const |
| Total count of documents that contain a given term in a given field.
|
double | docLengthAvg () const |
| Average document length.
|
COUNT_T | docCount (TERMID_T termID) const |
| Total count of documents that contain a given term.
|
COUNT_T | docIndexedLength (DOCID_T documentID) const |
| return indexed length of the document
|
COUNT_T | docLength (DOCID_T documentID) const |
| return length of the document
|
int | termMaxDocumentFrequency (TERMID_T termID) |
| Maximum number of times this term occurs in any single document.
|
int | termMinDocumentLength (TERMID_T termID) |
| Minimum length of any document containing this term.
|
double | termMaxDocumentFraction (TERMID_T termID) |
| Maximum over documents of (termCount/documentLength).
|
int | maxDocumentLength () |
| Maximum length of any document in the corpus.
|
Protected Methods |
void | _writeCache () |
void | _writeAndMerge () |
void | _writeBatchSegment () |
void | _mergeBatch () |
void | _mergeBatchSegments (int start, int end, int newNumber, bool finalMerge) |
void | _mergeBatchTermLists (const std::vector< int > &segmentMapping) |
void | _writeIncrementalSegment () |
void | _mergeIncrementalSegments () |
void | _readTermMapping (greedy_vector< int > &mapping, int segment, int secondSegment) |
void | _openMergeFiles (int startSegment, int endSegment, std::vector< File * > &listFiles, std::vector< File * > &statsFiles, std::vector< File * > &mappingFiles, std::vector< WriteBuffer * > &mappingBuffers, std::vector< ReadBuffer * > &statsBuffers, std::vector< indri::index::DocListFileIterator * > &listIterators, std::vector< char * > &terms, std::vector< indri::index::TermData * > &termDatas, bool finalMerge) |
void | _openDBs () |
void | _openReadOnlyDBs () |
void | _openSegments () |
void | _createDBs () |
void | _createFields (const std::vector< FieldDescription > &fieldNames) |
void | _closeFields () |
indri::index::DocumentData | fetchDocumentData (int key) const |
int | fetchDocumentLength (int key) const |
void | _updateTermlist (TERMID_T termID, int position) |
int | _updateTermData (int documentLength) |
size_t | _cacheSize () |
void | _computeMemoryBounds (size_t memorySize, float queryProportion) |
void | _resetEstimatePoint () |
indri::index::TermData * | _createTermData () |
indri::index::TermData * | _fetchTermData (TERMID_T termID) |
indri::index::TermData * | _lookupTermData (TERMID_T termID) |
void | _cleanCache () |
void | _deleteTermData (indri::index::TermData *termData) |
size_t | _sizeTermData () |
void | _clearTermData () |
void | _clearTermCache () |
void | _storeTermCache (const char *term, TERMID_T termID, indri::index::TermData *&termData) |
void | _flushTermStatistics (TERMID_T termID, const indri::index::TermFieldStatistics &statistics) |
void | _addTermDataToBuilder (indri::index::DocListDiskBuilder &builder, indri::index::DocListFileIterator &iterator, int writingID, int readingID) |
void | _addOpenTags (greedy_vector< indri::index::FieldExtent > &indexedTags, greedy_vector< indri::index::FieldExtent > &openTags, const greedy_vector< TagExtent > &extents, unsigned int &extentIndex, unsigned int position) |
void | _removeClosedTags (greedy_vector< indri::index::FieldExtent > &tags, unsigned int position) |
void | _lookupTerm (const char *term, TERMID_T &termID, indri::index::TermData *&termData) |
void | _finishDocument (greedy_vector< indri::index::TermFieldStatistics * > &seenStatistics) |
void | _writeDocumentTermList (File::offset_type &offset, int &byteLength, DOCID_T documentID, int documentLength, indri::index::TermListBuilder &locatedTerms) |
void | _writeDocumentStatistics (File::offset_type offset, int byteLength, int indexedLength, int totalLength, int uniqueTerms) |
void | _handleCache () |
int | _lookupTag (const char *tag) |
bool | _readTermData (TERMID_T &termID, char *termBuffer, indri::index::TermData *termData, ReadBuffer *termDataFile) |
void | _incrementalWriteTermData (TERMID_T termID, indri::index::TermData *termData) |
void | _batchWriteTermData (TERMID_T termID, indri::index::TermData *termData, WriteBuffer *file) |
int | _compressTermData (char *buffer, int size, indri::index::TermData *termData) |
void | _decompressTermData (const char *buffer, int size, indri::index::TermData *termData) |
void | _writeParameters (const std::string &fileName) |
bool | _readParameters (const std::string &fileName) |
void | _openDocumentFiles () |
std::string | _buildFileName (const char *suffix) |
std::string | _buildFileName (const char *suffix, int index) |
Protected Attributes |
bool | _readOnly |
indri::index::CorpusStatistics | _corpusStatistics |
std::vector< indri::index::FieldData * > | _fieldData |
std::map< const char *, int,
less_string > | _fieldLookup |
std::string | _baseName |
bool | _writingDocTermLists |
| the prefix name (note: this description appears to belong to _baseName, not _writingDocTermLists)
|
Keyfile | _termDataStore |
KeyfileWordMap | _documentMap |
KeyfileWordMap | _termMap |
File * | _documentStatisticsFile |
File | _documentLengthFile |
std::vector< File * > | _segments |
int | _batchSegmentCount |
File * | _documentTermLocationsFile |
indri::index::TermListBuilder | _termList |
Buffer | _termListBuffer |
greedy_vector< indri::index::TermData * > | _seenTerms |
HashTable< int, indri::index::TermData * > * | _termDataTable |
HashTable< const char *, term_cache_entry * > * | _cache |
| in memory storage of data relating to terms -- partial inverted lists and statistics
|
ReadBuffer * | _documentStatisticsBuffer |
ReadBuffer * | _documentLengthBuffer |
size_t | _listsSize |
size_t | _memorySize |
size_t | _termDataSize |
size_t | _termCacheSize |
size_t | _statisticsBufferSize |
size_t | _lengthBufferSize |
float | _queryProportion |
bool | _batchBuild |
INT64 | _estimatePoint |
INT64 | _lastCacheFlush |
| number of terms in the index when we should next check on flushing the inverted lists
|