00001 /*========================================================================== 00002 * 00003 * Original source copyright (c) 2001, Carnegie Mellon University. 00004 * See copyright.cmu for details. 00005 * Modifications copyright (c) 2002, University of Massachusetts. 00006 * See copyright.umass for details. 00007 * 00008 *========================================================================== 00009 */ 00010 00011 #ifndef _LEMUR_KEYFILE_DOCMGR_HPP 00012 #define _LEMUR_KEYFILE_DOCMGR_HPP 00013 00014 #include "common_headers.hpp" 00015 #include "DocumentManager.hpp" 00016 #include "RVLCompress.hpp" 00017 #include "TextHandlerManager.hpp" 00018 #include "Match.hpp" 00019 #include "Keyfile.hpp" 00020 00021 // array of byte offsets, indexed by token for each doc. 00022 #define BT_POSITIONS ".btp" 00023 // source file start, length 00024 #define BT_LOOKUP ".btl" 00025 // TOC 00026 #define BT_TOC ".bdm" 00027 // source files. 00028 #define BT_FID ".bfi" 00029 00030 00039 class KeyfileDocMgr : public DocumentManager, public TextHandler { 00040 public: 00042 KeyfileDocMgr() { myDoc = NULL; numdocs = 0; ignoreDoc = false; } 00043 00046 KeyfileDocMgr(const string &name); 00047 00052 KeyfileDocMgr(string name, string mode, string source); 00053 00054 virtual ~KeyfileDocMgr(); 00055 00057 char* getDoc(const string &docID) const; 00059 virtual char* handleDoc(char * docno); 00061 virtual void handleEndDoc(); 00063 virtual char *handleWord(char * word) { 00064 if (!ignoreDoc && word != NULL) { 00065 int end = myparser->fileTell() - 1; 00066 int start = (end - strlen(word)) + 1; 00067 Match m; 00068 m.start = start - docEntry.offset; 00069 m.end = end - docEntry.offset; 00070 offsets.push_back(m); 00071 } 00072 return word; 00073 } 00075 virtual void setParser(Parser *p) { 00076 myparser = p; 00077 } 00078 00079 virtual Parser* getParser() const { 00080 return (TextHandlerManager::createParser(pm)); 00081 } 00082 00085 virtual void buildMgr(); 00087 virtual const string &getMyID() const{ 00088 return IDnameext; 00089 } 00090 00094 vector<Match> getOffsets(const string &docID) const; 00095 00097 virtual bool open(const string &manname) { 00098 IDnameext = manname; 00099 IDname = manname.substr(0, manname.length() - 4); 00100 return loadTOC(); 00101 } 00102 00103 protected: 00104 struct btl { 00105 int fid; 00106 long offset; 00107 long bytes; 00108 }; 00109 00110 Parser *myparser; 00111 virtual void writeTOC(); 00112 virtual bool loadTOC(); 00113 bool loadFTFiles(const string &fn, int num); 00114 // the return object 00115 mutable vector <Match> offsets; 00116 int numdocs; // how many docs we have 00117 string pm; // parse mode 00118 00119 mutable Keyfile poslookup; // btree for lookup to positions list. 00120 mutable Keyfile doclookup; // btree for lookup to doc start. 00121 int dbcache; 00122 00123 btl docEntry; 00124 char *myDoc; 00125 int doclen; 00126 string IDname; // my name 00127 string IDnameext; // my name w/ extension 00128 vector<string> sources; // list of all source files 00130 int numOldSources; 00131 int fileid; // fileid of current/last file being processed 00133 bool ignoreDoc; 00134 }; 00135 00136 #endif // _LEMUR_KEYFILE_DOCMGR_HPP