00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _QRYBASEDSAMPLER_HPP 00013 #define _QRYBASEDSAMPLER_HPP 00014 00015 00016 00017 #include "FreqCounter.hpp" 00018 #include "DBManager.hpp" 00019 00021 //typedef stringset docidset; 00022 typedef set<docid_t, less<string> > docidset; 00023 00025 #define T_NDOCS 1 00026 00027 #define T_NWORDS 2 00028 00029 #define T_NQRYS 4 00030 00034 class QryBasedSampler { 00035 public: 00036 QryBasedSampler(); 00037 ~QryBasedSampler(); 00038 00040 bool probe(const char * initQuery); 00041 00043 void setDBManager(const DBManager * database); 00044 00046 const DBManager * getDBManager() const; 00047 00048 00051 void setFreqCounter(FreqCounter * counter); 00052 00054 const FreqCounter * getFreqCounter() const; 00055 00056 00060 void setOutputPrefix(const string &prefix); 00061 00063 const string &getOutputPrefix() const; 00064 00066 void setNumDocs(int n); 00067 00069 int getNumDocs() const; 00070 00071 00073 void setNumWords(int n); 00074 00076 int getNumWords() const; 00077 00078 00080 void setNumQueries(int n); 00081 00083 int getNumQueries() const; 00084 00085 00092 void setTermMode(int m); 00093 00095 int getTermMode() const; 00096 00097 00099 void setDocsPerQuery(int n); 00100 00102 int getDocsPerQuery() const; 00103 00104 00105 private: 00106 00107 /* for querying a db */ 00108 const DBManager * db; 00109 00110 00111 /* for building a description of a db */ 00112 FreqCounter * freqCounter; 00113 00114 00115 /* output prefix for filenames */ 00116 string outputPrefix; 00117 00118 00119 /* termination mode of the probe - 00120 * either T_NDOCS or T_NWORDS */ 00121 int termMode; 00122 00123 /* number unique docs to retrieve - only used if 00124 * termMode == T_NDOCS */ 00125 int numDocs; 00126 00127 /* number unique words to retrieve - only used if 00128 * termMode == T_NWORDS */ 00129 int numWords; 00130 00131 /* number of queries to run - only used if 00132 * termMode == T_NQRYS */ 00133 int numQueries; 00134 00135 /* documents per query to use */ 00136 int docsPerQuery; 00137 00138 /* stores the ids of the document already retrieved 00139 * from the system. used to prevent parsing 00140 * a document multiple times */ 00141 docidset seenDocs; 00142 }; 00143 00144 #endif