00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 /* 00012 10/22/2002 -- dmf Add arrays dtfstreams and invfstreams to hold open 00013 ifstreams for the inverted list files so that each is opened and closed 00014 only once. 00015 */ 00016 00017 #ifndef _INVINDEX_HPP 00018 #define _INVINDEX_HPP 00019 00021 #include "common_headers.hpp" 00022 #include "Index.hpp" 00023 #include "InvDocList.hpp" 00024 #include "InvTermList.hpp" 00025 #include "InvFPTypes.hpp" 00026 #include "Param.hpp" 00027 #include "DocMgrManager.hpp" 00028 00029 // for counts array 00030 #define UNIQUE_TERMS 0 00031 #define TOTAL_TERMS 1 00032 #define DOCS 2 00033 #define DT_FILES 3 00034 #define INV_FILES 4 00035 00036 #define NAMES_SIZE 8 00037 // for names array 00038 #define DOC_INDEX 0 00039 #define DOC_LOOKUP 1 00040 #define TERM_INDEX 2 00041 #define TERM_LOOKUP 3 00042 #define TERM_IDS 4 00043 #define DOC_IDS 5 00044 #define DOCMGR_IDS 6 00045 #define VERSION_NUM 7 00046 00047 class InvIndex : public Index { 00048 public: 00049 InvIndex(); 00050 InvIndex(const string &indexName); 00051 ~InvIndex(); 00052 00054 00055 00057 bool open(const string &indexName); 00059 00061 00062 00064 TERMID_T term(const TERM_T &word) const; 00065 00067 const TERM_T term(TERMID_T termID) const; 00068 00070 DOCID_T document(const EXDOCID_T &docIDStr) const; 00071 00073 const EXDOCID_T document(DOCID_T docID) const; 00074 00075 const DocumentManager* docManager(DOCID_T docID) const; 00076 00078 00080 00081 00083 COUNT_T docCount() const { return counts[DOCS]; }; 00084 00086 COUNT_T termCountUnique() const { return counts[UNIQUE_TERMS]; }; 00087 00089 COUNT_T termCount(TERMID_T termID) const; 00090 00092 COUNT_T termCount() const { return counts[TOTAL_TERMS]; }; 00093 00095 float docLengthAvg() const; 00096 00098 COUNT_T docCount(TERMID_T termID) const; 00099 00101 COUNT_T docLength(DOCID_T docID) const; 00102 00104 virtual COUNT_T docLengthCounted(DOCID_T docID) const; 00105 00107 00109 00110 00111 DocInfoList* docInfoList(TERMID_T termID) const; 00112 00114 TermInfoList* termInfoList(DOCID_T docID) const; 00115 00117 00119 void setMesgStream(ostream * lemStream); 00120 00121 protected: 00123 bool fullToc(const string &fileName); 00125 bool indexLookup(); 00127 bool invFileIDs(); 00129 bool docMgrIDs(); 00131 bool dtLookup(); 00133 bool dtLookup_ver1(); 00135 bool dtFileIDs(); 00137 bool termIDs(); 00139 bool docIDs(); 00140 00141 00142 LOC_T* counts; // array to hold all the overall count stats of this db 00143 string *names; // array to hold all the names for files we need for this db 00144 float aveDocLen; // the average document length in this index 00145 inv_entry* lookup; // the array holding entries (index is termid) 00146 dt_entry* dtlookup; // the array holding entries to dt index (index of array is docid) 00147 int dtloaded; // indicate load status of the dt index (loaded or not) 00148 TERM_T* terms; // array of the term spellings (index is termid) 00149 EXDOCID_T* docnames; // array of the external docids (index is docid) 00150 string * dtfiles; // array of dt index filenames 00151 ifstream *dtfstreams; // array of dt index input streams 00152 string * invfiles; // array of inv index filenames 00153 ifstream *invfstreams; // array of inv index input streams 00154 vector<DocumentManager*> docmgrs; // list of document managers 00155 // we make them mutable, but they don't actually get changed. this seems to be necessary for map::find method 00156 mutable map<TERM_T, TERMID_T, less<TERM_T> > termtable; // table of terms to termid 00157 mutable map<EXDOCID_T, DOCID_T, less<EXDOCID_T> > doctable; // table of exdocids to docid 00158 ostream* msgstream; // Lemur code messages stream 00159 }; 00160 00161 #endif