00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _LEMUR_KEYFILE_INCINDEX_HPP
00014 #define _LEMUR_KEYFILE_INCINDEX_HPP
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #include "common_headers.hpp"
00025 #include "Index.hpp"
00026 #include "InvFPDocList.hpp"
00027 #include "InvFPTermList.hpp"
00028 #include "InvFPTypes.hpp"
00029 #include "Param.hpp"
00030 #include "PushIndex.hpp"
00031 #include "MemCache.hpp"
00032 #include "Keyfile.hpp"
00033 #include "KeyfileDocMgr.hpp"
00034 #include "ReadBuffer.hpp"
00035 #include "WriteBuffer.hpp"
00036 #include "TermCache.hpp"
00037 #include <cstring>
00038 #include <queue>
00039
00040
/// Slot numbers in the KeyfileIncIndex::counts array. The inline accessors
/// termCountUnique(), termCount() and docCount() in this header return
/// counts[UNIQUE_TERMS], counts[TOTAL_TERMS] and counts[DOCS] respectively.
00041 #define UNIQUE_TERMS 0
00042 #define TOTAL_TERMS 1
00043 #define DOCS 2
/// NOTE(review): DT_FILES and INV_FILES are not referenced in this header;
/// presumably they are two further counts[] slots -- confirm against the
/// implementation file.
00044 #define DT_FILES 3
00045 #define INV_FILES 4
00046
/// Sizes of the docKey and termKey scratch buffers declared in
/// KeyfileIncIndex (char arrays of exactly this many elements).
00047 #define MAX_DOCID_LENGTH 512
00048 #define MAX_TERM_LENGTH 512
00049
/// Number of SegmentOffset entries in each KeyfileIncIndex::TermData.
00050 #define KEYFILE_MAX_SEGMENTS (16)
00051
00052
00053
/// Index class deriving from both PushIndex and Index. It declares document
/// and term ingestion entry points (beginDoc, addTerm, endDoc, endCollection)
/// alongside lookup and statistics accessors (term, document, docCount,
/// termCount, docInfoList, termInfoList).
///
/// NOTE(review): apart from three inline accessors, only declarations are
/// visible in this header. Descriptions marked "presumably" are inferred from
/// names and signatures and must be confirmed against the implementation.
///
/// NOTE(review): string, vector and ostream are used unqualified next to
/// std::vector / std::string, so this header depends on namespace std names
/// already being visible (presumably through common_headers.hpp -- confirm).
00066 class KeyfileIncIndex : public PushIndex, public Index {
00067 public:
/// Four-field record (one file offset plus three ints). It is the return type
/// of fetchDocumentRecord(). NOTE(review): the field meanings below are
/// inferred from the field names only.
00069 class record {
00070 public:
/// File offset; presumably where the data for one document starts.
00072 File::offset_type offset;
/// Presumably a length for the document -- TODO confirm what is counted.
00074 int len;
/// Presumably a total length for the document -- TODO confirm how it differs from len.
00076 int totalLen;
/// Presumably an identifying number (e.g. a document manager id) -- TODO confirm.
00078 int num;
00079 };
/// Triple of segment number, length and file offset. TermData holds an array
/// of these. NOTE(review): segment is presumably an index into _segments.
00081 struct SegmentOffset {
/// Segment number.
00083 unsigned int segment;
/// Length of the referenced data -- units not shown here.
00085 unsigned int length;
/// File offset of the referenced data.
00087 File::offset_type offset;
00088 };
/// Per-term data: two counts plus a fixed array of KEYFILE_MAX_SEGMENTS
/// SegmentOffset entries.
00090 struct TermData {
/// Presumably the total number of occurrences of the term -- confirm.
00092 COUNT_T totalCount;
/// Presumably the number of documents containing the term -- confirm.
00094 COUNT_T documentCount;
/// One entry per segment, KEYFILE_MAX_SEGMENTS entries in all.
00096 SegmentOffset segments[ KEYFILE_MAX_SEGMENTS ];
00097 };
/// Construct from a name prefix.
/// @param prefix      name prefix for the index.
/// @param cachesize   defaults to 128000000; units presumably bytes -- confirm.
/// @param startdocid  defaults to 1; presumably the first document id to assign.
00100 KeyfileIncIndex(const string &prefix, int cachesize=128000000,
00101 DOCID_T startdocid=1);
/// Default constructor.
00103 KeyfileIncIndex();
/// Destructor. NOTE(review): not declared virtual here; whether it is virtual
/// depends on the PushIndex / Index base declarations, which are not visible.
00105 ~KeyfileIncIndex();
00106
/// Set the index name from a prefix string.
00108 void setName(const string &prefix);
00109
/// Begin a new document described by dp; returns a bool status.
00111 bool beginDoc(const DocumentProps* dp);
00112
/// Add term t (presumably to the document opened by beginDoc); returns a bool status.
00114 bool addTerm(const Term& t);
00115
/// End the current document described by dp.
00117 void endDoc(const DocumentProps* dp);
00118
/// Overload of endDoc that also takes a manager name string (mgr).
/// NOTE(review): presumably mgr names a document manager, resolved through docMgrID().
00120 virtual void endDoc(const DocumentProps* dp, const string &mgr);
00121
/// Signal the end of the collection described by cp.
00123 void endCollection(const CollectionProps* cp);
00124
/// Set the document manager by its id string.
00126 void setDocManager(const string &mgrID);
00127
00128 protected:
/// Presumably attempts to open an existing index; returns a bool status -- confirm.
00130 bool tryOpen();
/// Presumably writes the index table of contents -- confirm.
00132 void writeTOC();
/// Write out the cache; lastRun defaults to false.
00134 void writeCache( bool lastRun = false );
/// Presumably the final cache write -- confirm how it relates to writeCache(true).
00136 void lastWriteCache();
00137
/// Presumably merges previously written cache segments -- confirm.
00139 void mergeCacheSegments();
/// Presumably writes the cache out as one segment -- confirm.
00141 void writeCacheSegment();
/// Presumably persists the document manager ids -- confirm.
00143 void writeDocMgrIDs();
/// Map a manager name string to an int id.
/// NOTE(review): behaviour for an unknown name is not visible here.
00146 int docMgrID(const string &mgr);
/// Virtual end-of-document hook taking the document properties and an int
/// manager id. NOTE(review): presumably the worker behind both endDoc overloads.
00148 virtual void doendDoc(const DocumentProps* dp, int mgrid);
/// NOTE(review): meaning not evident from this header -- TODO document.
00150 int listlengths;
00151
00152 public:
00154
00155
/// Open the index named indexName; returns a bool status.
00157 bool open(const string &indexName);
00159
00161
00162
/// Look up the term id for a term string.
00164 TERMID_T term(const TERM_T &word) const;
00165
/// Look up the term string for a term id.
00167 const TERM_T term(TERMID_T termID) const;
00168
/// Look up the internal document id for an external document id string.
00170 DOCID_T document(const EXDOCID_T &docIDStr) const;
00171
/// Look up the external document id string for an internal document id.
00173 const EXDOCID_T document(DOCID_T docID) const;
00174
/// Return a pointer to the DocumentManager for docID.
/// NOTE(review): ownership and null-return behaviour are not visible here.
00176 const DocumentManager *docManager(DOCID_T docID) const;
00177
00179
00181
00182
/// Returns counts[DOCS].
00184 COUNT_T docCount() const { return counts[DOCS]; };
00185
/// Returns counts[UNIQUE_TERMS].
00187 COUNT_T termCountUnique() const { return counts[UNIQUE_TERMS]; };
00188
/// Count for a single term id; presumably its total occurrences -- confirm.
00190 COUNT_T termCount(TERMID_T termID) const;
00191
/// Returns counts[TOTAL_TERMS].
00193 COUNT_T termCount() const { return counts[TOTAL_TERMS]; };
00194
/// Average document length as a float; presumably related to aveDocLen -- confirm.
00196 float docLengthAvg() const;
00197
/// Document count for a single term id; presumably the number of documents
/// containing the term -- confirm.
00199 COUNT_T docCount(TERMID_T termID) const;
00200
/// Length of document docID.
00202 COUNT_T docLength(DOCID_T docID) const;
00203
/// Total length of document docID.
/// NOTE(review): how this differs from docLength() is not visible here.
00205 virtual COUNT_T totaldocLength (DOCID_T docID) const;
00206
/// NOTE(review): presumably a document length computed by counting stored
/// terms -- confirm.
00208 COUNT_T docLengthCounted(DOCID_T docID) const;
00209
00211
00213
00214
/// Return a DocInfoList pointer for termID.
/// NOTE(review): presumably heap-allocated and owned by the caller -- confirm.
00215 DocInfoList* docInfoList(TERMID_T termID) const;
00216
/// Return a TermInfoList pointer for docID.
/// NOTE(review): presumably heap-allocated and owned by the caller -- confirm.
00218 TermInfoList* termInfoList(DOCID_T docID) const;
/// Variant of termInfoList(); presumably the terms in sequence order -- confirm.
00220 TermInfoList* termInfoListSeq(DOCID_T docID) const;
00221
00223
/// Set the message stream from an ostream pointer (presumably stored in msgstream).
00225 void setMesgStream(ostream * lemStream);
/// Record a position for a term that already has an id.
00227 void addKnownTerm( TERMID_T termID, LOC_T position );
/// Add a term with no known id and return a term id for it.
00229 TERMID_T addUnknownTerm( const InvFPTerm* term );
/// Add a term and return its term id; presumably for terms absent from _cache -- confirm.
00231 TERMID_T addUncachedTerm( const InvFPTerm* term );
00232
00233 protected:
/// Presumably opens existing on-disk databases -- confirm.
00235 void openDBs();
/// Presumably opens the files held in _segments -- confirm.
00237 void openSegments();
/// Presumably creates new on-disk databases -- confirm.
00239 void createDBs();
00240
/// Presumably reads the full table of contents -- confirm.
00242 void fullToc();
/// Presumably loads the document manager ids; returns a bool status -- confirm.
00244 bool docMgrIDs();
/// Fetch the record stored under document id key.
00246 record fetchDocumentRecord( DOCID_T key ) const;
/// Add a lookup entry pairing a document id with its name.
00248 void addDocumentLookup( DOCID_T documentKey, const char* documentName );
/// Add a lookup entry pairing a term id with its spelling.
00250 void addTermLookup( TERMID_T termKey, const char* termSpelling );
/// Add a number/name pair using two Keyfile arguments, one named for the
/// number-to-name direction and one for name-to-number.
/// NOTE(review): presumably the shared helper behind addDocumentLookup() and
/// addTermLookup() -- confirm.
00252 void addGeneralLookup( Keyfile& numberNameIndex, Keyfile& nameNumberIndex,
00253 TERMID_T number, const char* name );
/// Return an InvFPDocList pointer for termID (the concrete list type, in
/// contrast to the DocInfoList* returned by the public docInfoList()).
00255 InvFPDocList* internalDocInfoList(TERMID_T termID) const;
/// Presumably updates the current document's term list for a term occurrence
/// at position -- confirm.
00257 void _updateTermlist( InvFPDocList* curlist, LOC_T position );
/// Returns a cache size as an int -- units not shown here.
00259 int _cacheSize();
/// Presumably derives cache limits from memorySize -- confirm.
00261 void _computeMemoryBounds( int memorySize );
/// Presumably resets _estimatePoint -- confirm.
00263 void _resetEstimatePoint();
/// Array of summary counts, indexed by UNIQUE_TERMS, TOTAL_TERMS and DOCS in
/// the inline accessors above. NOTE(review): raw pointer; allocation and
/// ownership are not visible in this header.
00265 COUNT_T* counts;
/// NOTE(review): list of names; content not evident here -- TODO document.
00267 std::vector<std::string> names;
/// Presumably the cached average document length -- confirm.
00269 float aveDocLen;
/// Presumably the document manager names -- confirm.
00271 vector<std::string> docmgrs;
/// Message stream pointer.
00273 ostream* msgstream;
00274
/// NOTE(review): listing lines 00275-00296 show no declarations in this view.
/// Any members declared there (for example the Keyfile objects that
/// addGeneralLookup() would take) are not visible -- check the original header.
00275
00276
00278
00279
00280
00282
00284
00286
00288
00290
00292
00295
00296
/// Scratch buffer of MAX_TERM_LENGTH chars; mutable so const members can write to it.
00298 mutable char termKey[MAX_TERM_LENGTH];
/// Scratch buffer of MAX_DOCID_LENGTH chars; mutable so const members can write to it.
00300 mutable char docKey[MAX_DOCID_LENGTH];
/// Presumably the current size of the cached inverted lists -- confirm units.
00302 int _listsSize;
/// Presumably the memory budget for the cache -- confirm units.
00304 int _memorySize;
/// Index name string.
00306 std::string name;
/// Pointers to InvFPDocList objects; presumably the in-memory inverted lists.
/// NOTE(review): ownership is not visible here.
00308 vector<InvFPDocList*> invertlists;
/// Presumably the located terms of the document currently being built -- confirm.
00310 vector<LocatedTerm> termlist;
/// Presumably the id of the current document manager -- confirm.
00312 int curdocmgr;
/// Pointers to DocumentManager objects. NOTE(review): ownership is not visible here.
00314 vector<DocumentManager*> docMgrs;
/// Term cache.
00316 TermCache _cache;
00317
/// Pointers to File objects, presumably one per segment.
/// NOTE(review): ownership is not visible here.
00319 std::vector<File*> _segments;
/// Presumably the largest term id already written out -- confirm.
00321 TERMID_T _largestFlushedTermID;
/// NOTE(review): used with _resetEstimatePoint(); meaning not evident here -- TODO document.
00323 int _estimatePoint;
/// Presumably set when the current document should be skipped -- confirm.
00325 bool ignoreDoc;
00326 };
00327
00328
00329 #endif //_LEMUR_KEYFILE_INCINDEX_HPP