Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

KeyfileIncIndex.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _LEMUR_KEYFILE_INCINDEX_HPP
00014 #define _LEMUR_KEYFILE_INCINDEX_HPP
00015 
00016 /*
00017  * NAME DATE - COMMENTS
00018  * tnt 01/02 - created
00019  * dmf 07/03 - converted to incremental berkeley db btree indexer with
00020  * integrated document manager.
00021  * tds 09/03 - modified from BTIncIndex to use keyfile
00022  * dmf 12/03 - update to 2.1 API and remove parser/docmgr components.
00023 */
00024 #include "common_headers.hpp"
00025 #include "Index.hpp"
00026 #include "InvFPDocList.hpp"
00027 #include "InvFPTermList.hpp"
00028 #include "InvFPTypes.hpp"
00029 #include "Param.hpp"
00030 #include "PushIndex.hpp"
00031 #include "MemCache.hpp"
00032 #include "Keyfile.hpp"
00033 #include "KeyfileDocMgr.hpp"
00034 #include "ReadBuffer.hpp"
00035 #include "WriteBuffer.hpp"
00036 #include "TermCache.hpp"
00037 #include <cstring>
00038 #include <queue>
00039 
00040 // Slot indices into the KeyfileIncIndex::counts array.
00041 #define UNIQUE_TERMS 0  // slot read by termCountUnique()
00042 #define TOTAL_TERMS  1  // slot read by termCount()
00043 #define DOCS         2  // slot read by docCount()
00044 #define DT_FILES     3  // NOTE(review): not used in this header; presumably a count of document-term files -- confirm
00045 #define INV_FILES    4  // NOTE(review): not used in this header; presumably a count of inverted-list files -- confirm
00046 // Sizes of the docKey / termKey scratch buffers; the original note cites keyref.h for the value 512.
00047 #define MAX_DOCID_LENGTH 512
00048 #define MAX_TERM_LENGTH 512
00049 // Capacity of the TermData::segments array.
00050 #define KEYFILE_MAX_SEGMENTS (16)
00051 
00052 // we love multiple inheritance
00053 
00066 class KeyfileIncIndex : public PushIndex, public Index {  // derives from both PushIndex and Index, so one object exposes both interfaces
00067 public:
00069   class record {  // value type returned by fetchDocumentRecord()
00070   public:
00072     File::offset_type offset;  // a file offset; NOTE(review): presumably where the document's term list starts in dtlookup -- confirm
00074     int len;  // NOTE(review): presumably the counted document length (cf. docLength) -- confirm
00076     int totalLen;  // NOTE(review): presumably the full document length (cf. totaldocLength) -- confirm
00078     int num;     // NOTE(review): meaning not visible in this header; possibly the document manager id -- confirm
00079   };
00081   struct SegmentOffset {  // element type of TermData::segments
00083     unsigned int segment;  // NOTE(review): presumably an index into _segments -- confirm
00085     unsigned int length;  // NOTE(review): presumably the stored length of the list in that segment -- confirm
00087     File::offset_type offset;  // a file offset; NOTE(review): presumably the list's position inside the segment file -- confirm
00088   };
00090   struct TermData {  // two per-term counters plus a fixed-size array of segment locations
00092     COUNT_T totalCount;  // NOTE(review): presumably the value reported by termCount(termID) -- confirm
00094     COUNT_T documentCount;  // NOTE(review): presumably the value reported by docCount(termID) -- confirm
00096     SegmentOffset segments[ KEYFILE_MAX_SEGMENTS ];  // fixed capacity of KEYFILE_MAX_SEGMENTS entries
00097   };
00100   KeyfileIncIndex(const string &prefix, int cachesize=128000000, 
00101                   DOCID_T startdocid=1);  // defaults: cachesize 128000000, startdocid 1; NOTE(review): cachesize is presumably in bytes -- confirm
00103   KeyfileIncIndex();  // default constructor; NOTE(review): presumably followed by open() or setName() -- confirm
00105   ~KeyfileIncIndex();
00106 
00108   void setName(const string &prefix);  // takes the same kind of prefix string as the first constructor
00109 
00111   bool beginDoc(const DocumentProps* dp);  // NOTE(review): beginDoc/addTerm/endDoc/endCollection look like the PushIndex indexing callbacks -- confirm
00112 
00114   bool addTerm(const Term& t);
00115 
00117   void endDoc(const DocumentProps* dp);
00118 
00120   virtual void endDoc(const DocumentProps* dp, const string &mgr);  // overload that also takes a manager name string
00121 
00123   void endCollection(const CollectionProps* cp);
00124 
00126   void setDocManager(const string &mgrID);
00127     
00128 protected:
00130   bool tryOpen();  // returns bool; NOTE(review): presumably true when an existing index was opened -- confirm
00132   void writeTOC();
00134   void writeCache( bool lastRun = false );  // lastRun defaults to false
00136   void lastWriteCache();
00137 
00139   void mergeCacheSegments();
00141   void writeCacheSegment();
00143   void writeDocMgrIDs();
00146   int docMgrID(const string &mgr);  // maps a manager name string to an int id
00148   virtual void doendDoc(const DocumentProps* dp, int mgrid);  // virtual hook taking an int manager id; NOTE(review): presumably called by both endDoc overloads -- confirm
00150   int listlengths;
00151   
00152 public:
00154 
00155 
00157   bool open(const string &indexName);
00159 
00161 
00162 
00164   TERMID_T term(const TERM_T &word) const;  // term spelling -> term id
00165 
00167   const TERM_T term(TERMID_T termID) const;  // term id -> term spelling, returned by value
00168 
00170   DOCID_T document(const EXDOCID_T &docIDStr) const;  // external document id -> internal document id
00171 
00173   const EXDOCID_T document(DOCID_T docID) const; 
00174 
00176   const DocumentManager *docManager(DOCID_T docID) const;
00177 
00179 
00181 
00182 
00184   COUNT_T docCount() const { return counts[DOCS]; };  // returns counts[DOCS]
00185 
00187   COUNT_T termCountUnique() const { return counts[UNIQUE_TERMS]; };  // returns counts[UNIQUE_TERMS]
00188 
00190   COUNT_T termCount(TERMID_T termID) const;  // per-term overload; defined outside this header
00191 
00193   COUNT_T termCount() const { return counts[TOTAL_TERMS]; };  // returns counts[TOTAL_TERMS]
00194 
00196   float docLengthAvg() const;
00197 
00199   COUNT_T docCount(TERMID_T termID) const;  // per-term overload; defined outside this header
00200 
00202   COUNT_T docLength(DOCID_T docID) const;
00203 
00205   virtual COUNT_T totaldocLength (DOCID_T docID) const;
00206 
00208   COUNT_T docLengthCounted(DOCID_T docID) const;
00209 
00211 
00213 
00214 
00215   DocInfoList* docInfoList(TERMID_T termID) const;  // returns a raw pointer; NOTE(review): ownership presumably passes to the caller -- confirm
00216 
00218   TermInfoList* termInfoList(DOCID_T docID) const;  // returns a raw pointer; NOTE(review): ownership presumably passes to the caller -- confirm
00220   TermInfoList* termInfoListSeq(DOCID_T docID) const;  // NOTE(review): presumably the same terms in document (sequence) order -- confirm
00221 
00223 
00225   void setMesgStream(ostream * lemStream);  // NOTE(review): presumably stores the pointer in msgstream -- confirm
00227   void addKnownTerm( TERMID_T termID, LOC_T position );
00229   TERMID_T addUnknownTerm( const InvFPTerm* term );  // returns a term id
00231   TERMID_T addUncachedTerm( const InvFPTerm* term );  // returns a term id
00232 
00233 protected:
00235   void openDBs();
00237   void openSegments();
00239   void createDBs();
00240 
00242   void fullToc();
00244   bool docMgrIDs();
00246   record fetchDocumentRecord( DOCID_T key ) const;  // returns a record by value for the given document id
00248   void addDocumentLookup( DOCID_T documentKey, const char* documentName );
00250   void addTermLookup( TERMID_T termKey, const char* termSpelling );
00252   void addGeneralLookup( Keyfile& numberNameIndex, Keyfile& nameNumberIndex, 
00253                          TERMID_T number, const char* name );  // takes both directions of a number<->name Keyfile pair
00255   InvFPDocList* internalDocInfoList(TERMID_T termID) const;
00257   void _updateTermlist( InvFPDocList* curlist, LOC_T position );
00259   int _cacheSize();
00261   void _computeMemoryBounds( int memorySize );
00263   void _resetEstimatePoint();
00265   COUNT_T* counts;    // array indexed by UNIQUE_TERMS / TOTAL_TERMS / DOCS (see the inline getters above)
00267   std::vector<std::string> names;
00269   float aveDocLen; 
00271   vector<std::string> docmgrs;  // unqualified vector: relies on a using-declaration from an included header
00273   ostream* msgstream;
00274 
00275   // All database handles are marked mutable since they sometimes
00276   // must be used to fetch values during const methods
00278   mutable Keyfile invlookup;
00279   
00280   // int <-> string mappings for documents and terms
00282   mutable Keyfile dIDs;
00284   mutable Keyfile dSTRs;
00286   mutable Keyfile tIDs;
00288   mutable Keyfile tSTRs;
00290   mutable File dtlookup; 
00292   ReadBuffer* dtlookupReadBuffer; 
00295   mutable File writetlist; 
00296 
00298   mutable char termKey[MAX_TERM_LENGTH];  // scratch buffer, mutable so const methods can write to it
00300   mutable char docKey[MAX_DOCID_LENGTH];  // scratch buffer, mutable so const methods can write to it
00302   int _listsSize;
00304   int _memorySize;
00306   std::string name;
00308   vector<InvFPDocList*> invertlists; 
00310   vector<LocatedTerm> termlist; 
00312   int curdocmgr; 
00314   vector<DocumentManager*> docMgrs; 
00316   TermCache _cache;
00317 
00319   std::vector<File*> _segments;
00321   TERMID_T _largestFlushedTermID;
00323   int _estimatePoint; 
00325   bool ignoreDoc;  
00326 };
00327 
00328 
00329 #endif //_LEMUR_KEYFILE_INCINDEX_HPP

Generated on Wed Nov 3 12:58:59 2004 for Lemur Toolkit by doxygen 1.2.18