Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

InvIndex.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 /*
00012   10/22/2002 -- dmf Add arrays dtfstreams and invfstreams to hold open
00013   ifstreams for the inverted list files so that each is opened and closed
00014   only once.
00015  */
00016 
00017 #ifndef _INVINDEX_HPP
00018 #define _INVINDEX_HPP
00019 
00021 #include "common_headers.hpp"
00022 #include "Index.hpp"
00023 #include "InvDocList.hpp"
00024 #include "InvTermList.hpp"
00025 #include "InvFPTypes.hpp"
00026 #include "Param.hpp"
00027 #include "DocMgrManager.hpp"
00028 
00029 // Index constants for the InvIndex::counts array
00030 #define UNIQUE_TERMS 0  // slot returned by InvIndex::termCountUnique()
00031 #define TOTAL_TERMS  1  // slot returned by InvIndex::termCount()
00032 #define DOCS         2  // slot returned by InvIndex::docCount()
00033 #define DT_FILES     3  // NOTE(review): presumably the number of dt index files -- confirm
00034 #define INV_FILES    4  // NOTE(review): presumably the number of inv index files -- confirm
00035 
00036 #define NAMES_SIZE   8  // matches the eight names-array index constants below (0..7); NOTE(review): presumably the length of InvIndex::names -- confirm
00037 // Index constants for the InvIndex::names array (the file names needed for a db).
      // NOTE(review): which file each slot names is inferred from the macro name only -- confirm.
00038 #define DOC_INDEX    0
00039 #define DOC_LOOKUP   1
00040 #define TERM_INDEX   2
00041 #define TERM_LOOKUP  3
00042 #define TERM_IDS     4
00043 #define DOC_IDS      5
00044 #define DOCMGR_IDS   6
00045 #define VERSION_NUM  7
00046 
// InvIndex: a class publicly derived from Index. Per the member declarations
// below, its state consists of lookup arrays, term/docid tables, and arrays of
// input streams over the dt and inv index files.
// Standard-library names (string, vector, map, ostream, ifstream) are used
// unqualified, so a using-directive/declaration from an included header must
// be in effect.
// NOTE(review): the class declares a destructor and holds raw pointer members
// but declares no copy constructor or copy assignment; whether copying an
// InvIndex is safe depends on the implementation, which is not shown here.
// NOTE(review): the numbers in the left gutter are listing line numbers, not
// C++ source; gaps in the numbering presumably correspond to documentation
// comments omitted from this listing.
00047 class InvIndex : public Index {
00048 public:
00049    InvIndex();  // default constructor
00050    InvIndex(const string &indexName);  // not explicit, so a string converts implicitly; NOTE(review): presumably opens indexName -- confirm
00051   ~InvIndex();  // NOTE(review): presumably releases the arrays and streams declared below -- confirm
00052 
00054   // NOTE(review): presumably opens the index identified by indexName and
00055   // returns true on success -- confirm against the implementation.
00057   bool open(const string &indexName);
00059 
00061   // Conversions between ids (TERMID_T, DOCID_T) and their strings (TERM_T, EXDOCID_T).
00062   // Term spelling -> term id. NOTE(review): presumably backed by termtable -- confirm.
00064   TERMID_T term(const TERM_T &word) const;
00065   // Term id -> term spelling. NOTE(review): presumably read from terms[] -- confirm.
00067   const TERM_T term(TERMID_T termID) const;
00068   // External docid string -> docid. NOTE(review): presumably backed by doctable -- confirm.
00070   DOCID_T document(const EXDOCID_T &docIDStr) const;
00071   // Docid -> external docid string. NOTE(review): presumably read from docnames[] -- confirm.
00073   const EXDOCID_T document(DOCID_T docID) const; 
00074   // Document manager for docID. NOTE(review): presumably an entry of docmgrs; ownership of the returned pointer is not visible here.
00075   const DocumentManager* docManager(DOCID_T docID) const;
00076 
00078 
00080   // Collection statistics.
00081   // Returns counts[DOCS].
00083   COUNT_T docCount() const { return counts[DOCS]; };
00084   // Returns counts[UNIQUE_TERMS].
00086   COUNT_T termCountUnique() const { return counts[UNIQUE_TERMS]; };
00087   // Per-term overload. NOTE(review): presumably the total occurrences of termID in the collection -- confirm.
00089   COUNT_T termCount(TERMID_T termID) const;
00090   // Returns counts[TOTAL_TERMS].
00092   COUNT_T termCount() const { return counts[TOTAL_TERMS]; };
00093   // NOTE(review): presumably related to the aveDocLen member -- confirm.
00095   float docLengthAvg() const;
00096   // Per-term overload. NOTE(review): presumably the number of documents containing termID -- confirm.
00098   COUNT_T docCount(TERMID_T termID) const;
00099   // NOTE(review): presumably the length of document docID -- confirm the unit in the implementation.
00101   COUNT_T docLength(DOCID_T docID) const;
00102   // The only function explicitly marked virtual in this declaration. NOTE(review): how it differs from docLength is not visible here.
00104   virtual COUNT_T docLengthCounted(DOCID_T docID) const;
00105 
00107 
00109   // List accessors.
00110   // NOTE(review): returns a raw DocInfoList*; whether the caller must delete it is not visible here -- confirm.
00111   DocInfoList* docInfoList(TERMID_T termID) const;
00112   // NOTE(review): returns a raw TermInfoList*; whether the caller must delete it is not visible here -- confirm.
00114   TermInfoList* termInfoList(DOCID_T docID) const;
00115 
00117   // NOTE(review): presumably stores lemStream in msgstream for message output -- confirm.
00119  void setMesgStream(ostream * lemStream);
00120 
00121 protected:
        // Helper routines, accessible only to this class and classes derived from it.
        // NOTE(review): judging by their names they load the table of contents,
        // the lookup tables and the id lists, returning true on success, and
        // dtLookup_ver1 presumably handles an older on-disk format version --
        // confirm each against the implementation, which is not part of this listing.
00123   bool fullToc(const string &fileName);
00125   bool indexLookup();
00127   bool invFileIDs();
00129   bool docMgrIDs();
00131   bool dtLookup();
00133   bool dtLookup_ver1();
00135   bool dtFileIDs();
00137   bool termIDs();
00139   bool docIDs();
00140 
00141   // Data members.
00142   LOC_T* counts;    // array of the overall count statistics for this db; indexed by UNIQUE_TERMS .. INV_FILES
00143   string *names;  // array of the file names needed for this db; indexed by DOC_INDEX .. VERSION_NUM
00144   float aveDocLen; // the average document length in this index
00145   inv_entry* lookup;  // array of inv_entry records (array index is termid)
00146   dt_entry* dtlookup; // array of entries into the dt index (array index is docid)
00147   int dtloaded; // load status of the dt index (loaded or not)
00148   TERM_T* terms;   // array of the term spellings (array index is termid)
00149   EXDOCID_T* docnames; // array of the external docids (array index is docid)
00150   string * dtfiles; // array of dt index filenames
00151   ifstream *dtfstreams; // array of dt index input streams; per the 10/22/2002 change note at the top of the file, held open so each file is opened and closed only once
00152   string * invfiles; // array of inv index filenames
00153   ifstream *invfstreams; // array of inv index input streams (same rationale as dtfstreams)
00154   vector<DocumentManager*> docmgrs; // list of document managers
00155   // The two tables below are declared mutable. The original author notes that they do not
        // actually get changed and that mutable seemed necessary in order to call map::find.
        // NOTE(review): std::map::find has a const overload, so mutable may be unnecessary
        // unless the implementation uses operator[] from a const member function -- confirm.
00156   mutable map<TERM_T, TERMID_T, less<TERM_T> > termtable; // table of terms to termid
00157   mutable map<EXDOCID_T, DOCID_T, less<EXDOCID_T> > doctable; // table of exdocids to docid
00158   ostream* msgstream; // stream for Lemur code messages
00159 };
00160 
00161 #endif

Generated on Wed Nov 3 12:58:58 2004 for Lemur Toolkit by doxygen 1.2.18