Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

InvDocList.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _INVDOCLIST_HPP
00014 #define _INVDOCLIST_HPP
00015 
00016 /*
00017  * NAME DATE - COMMENTS
00018  * tnt 03/2001 - created
00019  *
00020  *========================================================================*/
00021 #include <cmath>
00022 #include "InvFPTypes.hpp"
00023 #include "common_headers.hpp"
00024 #include "DocInfoList.hpp"
00025 #include "MemCache.hpp"
00026 #include "RVLCompress.hpp"
00027 
00028 extern "C" {
      // NOTE(review): <cstdio> is a C++ standard library header and is being
      // included inside a linkage-specification here. The C++ standard requires
      // standard headers to be included outside of any declaration, so this
      // wrapper looks unnecessary at best -- confirm and consider a plain
      // top-level #include <cstdio>.
00029   #include <cstdio>
00030 }
00031 
00032 #define DEFAULT 9 // NOTE(review): not used in this header and the meaning of 9 is not visible here; the unprefixed name is seen by every includer -- confirm intent
00033 
00034 class InvDocList: public DocInfoList {
      // A DocInfoList implementation backed by a single LOC_T buffer delimited
      // by `begin` and `end` (see the data members at the bottom of the class).
      // The object also carries the id of the term it belongs to (`uid`) and
      // that term's document frequency (`df`).
      // NOTE(review): presumably this is the per-term inverted list (term ->
      // documents containing it) used by the inverted index; only the
      // declarations are visible here, so confirm against InvDocList.cpp.
00035 public:
      // Default constructor; initial member values are set in the .cpp file.
00036   InvDocList();
00037 
      // Construct for term `id`. NOTE(review): `len` is presumably the
      // character length of the term string (cf. `strlength`) -- confirm.
00040   InvDocList(TERMID_T id, int len);
      // Same as above, additionally given a MemCache. NOTE(review): presumably
      // the list's memory is taken from `mc` (cf. `cache`) -- confirm.
00042   InvDocList(MemCache* mc, TERMID_T id, int len);  
      // As above, also given a first document id and location.
      // NOTE(review): presumably seeds the list with that entry -- confirm.
00043   InvDocList(MemCache* mc, TERMID_T id, int len, DOCID_T docid, LOC_T location);
      // Construct from an existing LOC_T array. NOTE(review): presumably
      // `listlen` is the element count of `list`, `fr` the document frequency
      // and `ldocid` a pointer to the last doc id; ownership of `list` is not
      // visible from this header -- confirm.
00045   InvDocList(TERMID_T id, int listlen, LOC_T* list, int fr, DOCID_T* ldocid, int len);
      // NOTE(review): declared without `virtual`; whether destruction through a
      // DocInfoList* is safe depends on the base class destructor -- confirm.
00046   ~InvDocList();
00047 
      // Takes the same arguments as the array constructor; `ldocid` defaults to
      // NULL and `len` to 0. NOTE(review): presumably re-points this object at
      // `list` -- confirm.
00053   void setList(TERMID_T id, int listlen, LOC_T* list, int fr, DOCID_T* ldocid=NULL, int len=0);
00054 
      // Same parameter list as setList() but without default arguments.
      // NOTE(review): what makes this variant "safe" is not visible here.
00058   void setListSafe(TERMID_T id, int listlen, LOC_T* list, int fr, DOCID_T* ldocid, int len);
00059 
      // NOTE(review): reset() and resetFree() presumably both clear the list,
      // with resetFree() also releasing its memory -- confirm in the .cpp file.
00063   void reset();
00064 
00067   void resetFree();
00068 
      // Memory management queries; both are defined out of line.
00069   bool allocMem();
00070   bool hasNoMem();
00071 
      // NOTE(review): presumably records one occurrence of this term in
      // document `docid`; the meaning of the bool result is not visible here.
00073   virtual bool addTerm(DOCID_T docid);
00074 
      // NOTE(review): presumably appends the contents of `tail` to this list.
00076   virtual bool append(InvDocList* tail);
00077 
      // Iteration interface. All four are const; the iteration cursor `iter`
      // is declared mutable so that it can be changed from const members.
00078   virtual void startIteration() const;
00079   virtual bool hasMore() const;
      // NOTE(review): the returned pointer presumably refers to the private
      // mutable `entry` member, i.e. it is overwritten by the next call.
00080   virtual DocInfo* nextEntry() const;
00081   virtual void nextEntry(DocInfo* info) const;
00082 
      // Value pointed to by `lastid`, or -1 when `lastid` is NULL.
00083   DOCID_T curDocID() const{ if (lastid == NULL) return -1; return *lastid; };
      // Current value of `df`.
00084   COUNT_T docFreq() const{ return df; };
      // Number of LOC_T elements between `begin` and `end` (not bytes).
00085   int length() const{ return end-begin; };
      // Current value of `uid`.
00086   TERMID_T termID() const{ return uid; };
      // Current value of `strlength`.
00087   int termLen() const{ return strlength; };
      // Defined out of line. NOTE(review): presumably the collection term
      // frequency of this term -- confirm.
00088   virtual COUNT_T termCTF() const;
      // Offset of `lastid` from `begin`, in LOC_T elements.
00089   int curDocIDdiff() const{ return lastid-begin; };
      // Value stored in the element immediately after `lastid`.
      // NOTE(review): unlike curDocID(), there is no NULL check on `lastid`.
00090   int curDocIDtf() const{ return *(lastid+1); };
      // Current value of `size`.
00091   int memorySize() const{ return size; };
00092 
      // Binary serialization to/from file streams; bodies are out of line.
00094   void binWrite(ofstream& of);
00095 
00097   bool binRead(ifstream& inf);
00098 
      // NOTE(review): the "C" variants are presumably the compressed forms
      // (this header includes RVLCompress.hpp) -- confirm.
00100   void binWriteC(ofstream& of);
00101 
00103   bool binReadC(ifstream& inf);
00104 
00105 protected:
00106   // Helper functions for iterator, subclasses should override
00108   virtual DocInfo* getElement(DocInfo* elem, POS_T position) const;
      // The `begin` pointer converted to POS_T with a C-style cast.
00110   virtual POS_T beginPosition() const { return (POS_T) begin; }
      // The `end` pointer converted to POS_T with a C-style cast.
00112   virtual POS_T endPosition() const { return (POS_T) end; }
00114   virtual POS_T nextPosition(POS_T position) const;
00115 
      // NOTE(review): presumably grows the buffer (see the comment on `size`);
      // logb2 is presumably a base-2 logarithm helper -- confirm.
00119   bool getMoreMem();
00120   int logb2(int num);
00121 
      // NOTE(review): presumably convert the stored values to and from a
      // delta (difference) encoding -- confirm which values are affected.
00124   virtual void deltaEncode();
00125 
00128   virtual void deltaDecode();
00129 
00130   // Use LOC_T* for TERMID/DOCID/COUNT/LOC.
00131   LOC_T* begin;         // pointer to the beginning of this list
00132   LOC_T* lastid;        // pointer to the most recent DocID added
00133   LOC_T* freq;          // pointer to the frequency of the last DocID
00134   LOC_T * end;            // pointer to the next free memory
00135   mutable LOC_T* iter;    // pointer tells us where we are in iteration
00136   int  size;            // how big are we, increment in powers of 2, start at 16K
00137   int  LOC_Tsize;       // sizeof(LOC_T) value
00138   int  strlength;       // the character length of our corresponding string
00139   TERMID_T  uid;                          // a unique ID for our string
00140   COUNT_T  df;                    // the document frequency for current term
00141   MemCache* cache;      // the cache to get memory from
00142   bool hascache;        // remember if we have our own cache
00143 
00144   bool READ_ONLY;    // flag for whether this list can be added
00145 private:
      // Mutable so that const members can modify it. NOTE(review): presumably
      // the scratch DocInfo handed out by nextEntry() -- confirm.
00146   mutable DocInfo entry;
00147 };
00148 
00149 #endif

Generated on Wed Nov 3 12:58:58 2004 for Lemur Toolkit by doxygen 1.2.18