|
Public Methods |
| | IndriIndex (size_t memorySize=INDRI_DEFAULT_MEMORY_SIZE, float queryProportion=INDRI_DEFAULT_QUERY_PROPORTION) |
| | ~IndriIndex () |
| void | setName (const std::string &prefix) |
| | sets the name for this index
|
| DOCID_T | addDocument (struct ParsedDocument *document) |
| DOCID_T | addDocument (const char *documentName, const greedy_vector< char * > &words, const greedy_vector< TagExtent > &tagExtents) |
| DocInfoList * | docInfoList (TERMID_T termID) const |
| | doc entries in a term index. See also: DocList
|
| DocPositionInfoList * | docPositionInfoList (TERMID_T termID) |
| | doc entries in a term index with positions
|
| indri::index::DocListFrequencyIterator * | docFrequencyInfoList (TERMID_T termID) |
| | doc entries in a term index without positions
|
| TermInfoList * | termInfoList (DOCID_T docID) const |
| | word entries in a document index (bag of words). See also: TermList
|
| TermInfoList * | termInfoListSeq (DOCID_T docID) const |
| | word entries in a document index (sequence of words). See also: TermList
|
| indri::index::TermListBuilder * | termPositionList (DOCID_T docID) |
| | internal IndriIndex term list representation
|
| indri::index::FieldListIterator * | fieldPositionListIterator (int fieldID) |
| | field list
|
|
| bool | open (const std::string &indexName) |
| | Open previously created Index with given prefix.
|
| bool | open (const char *indexName) |
| | Open previously created Index with given prefix.
|
| bool | openRead (const std::string &indexName) |
| | Open previously created Index with given prefix in read only mode.
|
| bool | create (const std::string &indexName) |
| | Create a new index with the given prefix.
|
| bool | create (const std::string &indexName, const std::vector< FieldDescription > &fields) |
| | Create a new index with the given prefix and tag set.
|
| void | close () |
| | Close the index.
|
|
| TERMID_T | term (const TERM_T &word) const |
| | Convert a term spelling to a termID.
|
| const TERM_T | term (TERMID_T termID) const |
| | Convert a termID to its spelling.
|
| DOCID_T | document (const EXDOCID_T &docIDStr) const |
| | Convert a document spelling to its docID.
|
| const EXDOCID_T | document (DOCID_T docID) const |
| | Convert a docID to its spelling.
|
| const char * | field (int fieldID) |
| | Convert a fieldID to its name.
|
| int | field (const char *fieldName) |
| | Convert a field name to its fieldID.
|
| int | field (const std::string &fieldName) |
| | Convert a field name to its fieldID.
|
|
| COUNT_T | docCount () const |
| | Total count (i.e., number) of documents in collection.
|
| COUNT_T | termCountUnique () const |
| | Total count of unique terms in collection.
|
| INT64 | termCount (TERMID_T termID) const |
| | Total count of a term in the collection.
|
| INT64 | termCount () const |
| | Total count of all terms in the collection.
|
| INT64 | fieldTermCount (int fieldID, TERMID_T termID) const |
| | Total count of a term in a field.
|
| INT64 | fieldTermCount (int fieldID) const |
| | Total count of all terms in a field.
|
| INT64 | fieldDocCount (int fieldID) const |
| | Total count of documents that contain a given field.
|
| INT64 | fieldDocCount (int fieldID, TERMID_T termID) const |
| | Total count of documents that contain a given term in a given field.
|
| double | docLengthAvg () const |
| | Average document length.
|
| COUNT_T | docCount (TERMID_T termID) const |
| | Total count of documents that contain a given term.
|
| COUNT_T | docIndexedLength (DOCID_T documentID) const |
| | return indexed length of the document
|
| COUNT_T | docLength (DOCID_T documentID) const |
| | return length of the document
|
| int | termMaxDocumentFrequency (TERMID_T termID) |
| | Maximum number of times this term occurs in any single document.
|
| int | termMinDocumentLength (TERMID_T termID) |
| | Minimum length of any document containing this term.
|
| double | termMaxDocumentFraction (TERMID_T termID) |
| | Maximum over documents of (termCount/documentLength).
|
| int | maxDocumentLength () |
| | Maximum length of any document in the corpus.
|
Protected Methods |
| void | _writeCache () |
| void | _writeAndMerge () |
| void | _writeBatchSegment () |
| void | _mergeBatch () |
| void | _mergeBatchSegments (int start, int end, int newNumber, bool finalMerge) |
| void | _mergeBatchTermLists (const std::vector< int > &segmentMapping) |
| void | _writeIncrementalSegment () |
| void | _mergeIncrementalSegments () |
| void | _readTermMapping (greedy_vector< int > &mapping, int segment, int secondSegment) |
| void | _openMergeFiles (int startSegment, int endSegment, std::vector< File * > &listFiles, std::vector< File * > &statsFiles, std::vector< File * > &mappingFiles, std::vector< WriteBuffer * > &mappingBuffers, std::vector< ReadBuffer * > &statsBuffers, std::vector< indri::index::DocListFileIterator * > &listIterators, std::vector< char * > &terms, std::vector< indri::index::TermData * > &termDatas, bool finalMerge) |
| void | _openDBs () |
| void | _openReadOnlyDBs () |
| void | _openSegments () |
| void | _createDBs () |
| void | _createFields (const std::vector< FieldDescription > &fieldNames) |
| void | _closeFields () |
| indri::index::DocumentData | fetchDocumentData (int key) const |
| int | fetchDocumentLength (int key) const |
| void | _updateTermlist (TERMID_T termID, int position) |
| int | _updateTermData (int documentLength) |
| size_t | _cacheSize () |
| void | _computeMemoryBounds (size_t memorySize, float queryProportion) |
| void | _resetEstimatePoint () |
| indri::index::TermData * | _createTermData () |
| indri::index::TermData * | _fetchTermData (TERMID_T termID) |
| indri::index::TermData * | _lookupTermData (TERMID_T termID) |
| void | _cleanCache () |
| void | _deleteTermData (indri::index::TermData *termData) |
| size_t | _sizeTermData () |
| void | _clearTermData () |
| void | _clearTermCache () |
| void | _storeTermCache (const char *term, TERMID_T termID, indri::index::TermData *&termData) |
| void | _flushTermStatistics (TERMID_T termID, const indri::index::TermFieldStatistics &statistics) |
| void | _addTermDataToBuilder (indri::index::DocListDiskBuilder &builder, indri::index::DocListFileIterator &iterator, int writingID, int readingID) |
| void | _addOpenTags (greedy_vector< indri::index::FieldExtent > &indexedTags, greedy_vector< indri::index::FieldExtent > &openTags, const greedy_vector< TagExtent > &extents, unsigned int &extentIndex, unsigned int position) |
| void | _removeClosedTags (greedy_vector< indri::index::FieldExtent > &tags, unsigned int position) |
| void | _lookupTerm (const char *term, TERMID_T &termID, indri::index::TermData *&termData) |
| void | _finishDocument (greedy_vector< indri::index::TermFieldStatistics * > &seenStatistics) |
| void | _writeDocumentTermList (File::offset_type &offset, int &byteLength, DOCID_T documentID, int documentLength, indri::index::TermListBuilder &locatedTerms) |
| void | _writeDocumentStatistics (File::offset_type offset, int byteLength, int indexedLength, int totalLength, int uniqueTerms) |
| void | _handleCache () |
| int | _lookupTag (const char *tag) |
| bool | _readTermData (TERMID_T &termID, char *termBuffer, indri::index::TermData *termData, ReadBuffer *termDataFile) |
| void | _incrementalWriteTermData (TERMID_T termID, indri::index::TermData *termData) |
| void | _batchWriteTermData (TERMID_T termID, indri::index::TermData *termData, WriteBuffer *file) |
| int | _compressTermData (char *buffer, int size, indri::index::TermData *termData) |
| void | _decompressTermData (const char *buffer, int size, indri::index::TermData *termData) |
| void | _writeParameters (const std::string &fileName) |
| bool | _readParameters (const std::string &fileName) |
| void | _openDocumentFiles () |
| std::string | _buildFileName (const char *suffix) |
| std::string | _buildFileName (const char *suffix, int index) |
Protected Attributes |
| bool | _readOnly |
| indri::index::CorpusStatistics | _corpusStatistics |
| std::vector< indri::index::FieldData * > | _fieldData |
| std::map< const char *, int, less_string > | _fieldLookup |
| std::string | _baseName |
| bool | _writingDocTermLists |
| | the prefix name (this description belongs to _baseName, not _writingDocTermLists)
|
| Keyfile | _termDataStore |
| KeyfileWordMap | _documentMap |
| KeyfileWordMap | _termMap |
| File * | _documentStatisticsFile |
| File | _documentLengthFile |
| std::vector< File * > | _segments |
| int | _batchSegmentCount |
| File * | _documentTermLocationsFile |
| indri::index::TermListBuilder | _termList |
| Buffer | _termListBuffer |
| greedy_vector< indri::index::TermData * > | _seenTerms |
| HashTable< int, indri::index::TermData * > * | _termDataTable |
| HashTable< const char *, term_cache_entry * > * | _cache |
| | in memory storage of data relating to terms -- partial inverted lists and statistics
|
| ReadBuffer * | _documentStatisticsBuffer |
| ReadBuffer * | _documentLengthBuffer |
| size_t | _listsSize |
| size_t | _memorySize |
| size_t | _termDataSize |
| size_t | _termCacheSize |
| size_t | _statisticsBufferSize |
| size_t | _lengthBufferSize |
| float | _queryProportion |
| bool | _batchBuild |
| INT64 | _estimatePoint |
| INT64 | _lastCacheFlush |
| | number of terms in the index when we should next check on flushing the inverted lists
|