Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

KeyfileDocMgr.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009 */
00010 
00011 #ifndef _LEMUR_KEYFILE_DOCMGR_HPP
00012 #define _LEMUR_KEYFILE_DOCMGR_HPP
00013 
00014 #include "common_headers.hpp"
00015 #include "DocumentManager.hpp"
00016 #include "RVLCompress.hpp"
00017 #include "TextHandlerManager.hpp"
00018 #include "Match.hpp"
00019 #include "Keyfile.hpp"
00020 
00021 // Suffix for the positions data: an array of byte offsets, indexed by token, for each doc.
00022 #define BT_POSITIONS ".btp"
00023 // Suffix for the lookup data: per-document source file, start offset and length.
00024 #define BT_LOOKUP ".btl"
00025 // Suffix for the TOC (presumably "table of contents" of the manager -- confirm against loadTOC/writeTOC).
00026 #define BT_TOC ".bdm"
00027 // Suffix for the list of source files.
00028 #define BT_FID ".bfi"
00029 
00030 
00039 class KeyfileDocMgr : public DocumentManager, public TextHandler {
      // A DocumentManager that is also a TextHandler. Through handleWord() it
      // records the byte span of each token of the current document, and it
      // holds two Keyfile members (poslookup, doclookup) plus the list of
      // source files. Most member functions are only declared here; their
      // implementations are not part of this header.
00040 public:
        // Default constructor: clears myDoc, numdocs and ignoreDoc only.
        // NOTE(review): myparser, dbcache, docEntry, doclen, numOldSources and
        // fileid are not assigned here and have no in-class initializers.
00042   KeyfileDocMgr() {  myDoc = NULL;  numdocs = 0; ignoreDoc = false; }
00043 
        // Construct from a manager name.
        // NOTE(review): presumably opens an existing manager (cf. open()) -- confirm in the .cpp.
00046   KeyfileDocMgr(const string &name);
00047 
        // Construct from a name, a mode and a source.
        // NOTE(review): `mode` is presumably the parse mode stored in pm and
        // `source` the list of source files to process -- confirm in the .cpp.
00052   KeyfileDocMgr(string name, string mode, string source);  
00053 
00054   virtual ~KeyfileDocMgr();
00055 
        // Returns a char buffer for the document identified by docID.
        // NOTE(review): ownership of the returned buffer is not visible here -- confirm.
00057   char* getDoc(const string &docID) const;
        // Handler invoked with a document number.
        // NOTE(review): presumably called at the start of each document -- confirm.
00059   virtual char* handleDoc(char * docno);
        // Handler with no arguments; by its name, presumably called when a document ends -- confirm.
00061   virtual void handleEndDoc();
        // Records the byte span of one token and returns `word` unchanged.
        // Nothing is recorded when ignoreDoc is set or `word` is NULL.
        // NOTE(review): myparser is dereferenced without a null check, so
        // setParser() must have been called with a valid parser first.
00063   virtual char *handleWord(char * word) {
00064     if (!ignoreDoc && word != NULL) {
            // Assumes fileTell() is the position just past the token, so
            // `end` is the offset of the token's last byte -- TODO confirm.
00065       int end = myparser->fileTell() - 1;
            // NOTE(review): strlen() returns size_t, so this subtraction is
            // carried out in unsigned arithmetic before conversion to int.
00066       int start = (end - strlen(word)) + 1;
00067       Match m;
            // Store the span relative to docEntry.offset rather than as
            // absolute file positions.
00068       m.start = start - docEntry.offset;
00069       m.end = end - docEntry.offset;    
00070       offsets.push_back(m);
00071     }
00072     return word;
00073   }
        // Stores the parser whose fileTell() is used by handleWord().
00075   virtual void setParser(Parser *p) {
00076     myparser = p;
00077   }
00078 
        // Returns the result of TextHandlerManager::createParser(pm); this is
        // not the pointer stored by setParser().
        // NOTE(review): who owns the returned parser is not visible here -- confirm.
00079   virtual Parser* getParser() const {
00080     return (TextHandlerManager::createParser(pm));
00081   }
00082 
        // NOTE(review): presumably builds the manager's data from the source
        // files -- implementation not shown in this header.
00085   virtual void buildMgr();
        // Returns the manager name including its extension (IDnameext).
00087   virtual const string &getMyID() const{
00088     return IDnameext;
00089   }
00090 
        // Returns, by value, a vector of Match entries for the document docID.
00094   vector<Match> getOffsets(const string &docID) const;
00095 
        // Opens the manager named `manname`: remembers the full name, derives
        // the base name by dropping the last 4 characters, and returns the
        // result of loadTOC().
00097   virtual bool open(const string &manname) {
00098     IDnameext = manname;
          // NOTE(review): presumably strips a 4-character extension such as
          // BT_TOC. length() is unsigned, so for names shorter than 4
          // characters the count wraps and substr() yields the whole name.
00099     IDname = manname.substr(0, manname.length() - 4);
00100     return loadTOC();
00101   }
00102 
00103 protected:
        // Per-document location record.
        // NOTE(review): presumably fid is the source file id, offset the
        // start position in that file and bytes the length -- confirm.
00104   struct btl {
00105     int fid;
00106     long offset;
00107     long bytes;
00108   };
00109 
        // Parser set through setParser(); read by handleWord().
00110   Parser *myparser;
00111   virtual void writeTOC();
        // Its bool result is what open() returns.
00112   virtual bool loadTOC();
00113   bool loadFTFiles(const string &fn, int num);
00114   // the return object
00115   mutable vector <Match> offsets;
00116   int numdocs;              // how many docs we have
00117   string pm;  // parse mode
00118 
00119   mutable Keyfile poslookup; // btree for lookup to positions list.
00120   mutable Keyfile doclookup; // btree for lookup to doc start.
        // NOTE(review): presumably a cache-size setting for the Keyfiles -- confirm.
00121   int dbcache;
00122   
        // Entry for the current document; handleWord() subtracts its offset.
00123   btl docEntry;
        // NOTE(review): presumably the document text buffer and its length -- confirm.
00124   char *myDoc;
00125   int doclen;
00126   string IDname;            // my name
00127   string IDnameext;                     // my name w/ extension
00128   vector<string> sources;   // list of all source files
00130   int numOldSources;
00131   int fileid;       // fileid of current/last file being processed
        // When true, handleWord() records nothing for the current document.
00133   bool ignoreDoc;
00134 };
00135 
00136 #endif // _LEMUR_KEYFILE_DOCMGR_HPP

Generated on Wed Nov 3 12:58:59 2004 for Lemur Toolkit by doxygen 1.2.18