Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

FreqCounter.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 #ifndef _FREQCOUNTER_HPP
00013 #define _FREQCOUNTER_HPP
00014 
00015 #include <map>
00016 #include <set>
00017 #include "InvFPTypes.hpp"
00018 #include "TextHandler.hpp"
00019 #include "Stopper.hpp"
00020 
00022 #define R_CTF 0   /* Random-mode code 0; name parallels FreqCounter::randomCtf(). Presumably a value for setRandomMode() -- confirm. */
00023 
00024 #define R_DF 1   /* Random-mode code 1; name parallels FreqCounter::randomDf(). */
00025 
00026 #define R_AVE_TF 2   /* Random-mode code 2; name parallels FreqCounter::randomAveTf(). */
00027 
00028 #define R_UNIFORM 3   /* Random-mode code 3; name parallels FreqCounter::randomUniform(). */
00029 
00031 typedef struct freqinfo_tt {   /* Per-word record; used as the mapped value type of freqmap. */
00032   char * word;   /* The word as a C string; who owns this buffer is not visible in this header. */
00033   int ctf;   /* Presumably the collection term frequency of the word -- TODO confirm. */
00034   int df;   /* Presumably the document frequency of the word -- TODO confirm. */
00035 } freqinfo_t;
00036 
00038 typedef map<char *, freqinfo_t, ltstr> freqmap;   /* Maps a char* key to its freqinfo_t record, ordered by ltstr (not declared in this header). NOTE(review): map/set are used unqualified, so std names must already be visible via an included header -- confirm. */
00040 typedef set<char *, ltstr> stringset;   /* Set of char* keys ordered by the same ltstr comparator. */
00041 
00042 
00048 class FreqCounter : public TextHandler {
00049   /* TextHandler subclass that holds per-word freqinfo_t records (freqInfo) together with running totals and a random-selection mode. Only declarations are visible here; behavioral notes below are hedged accordingly. */
00050 public:
00053   FreqCounter(const Stopper * stopWords = NULL);   /* stopWords is optional and defaults to NULL. */
00056   FreqCounter(const string &filename, const Stopper * stopWords = NULL);   /* Presumably initializes the counter from filename via input() -- TODO confirm. */
00057   
00059   ~FreqCounter();
00060   /* NOTE(review): clear() presumably discards the accumulated counts -- confirm in the implementation. */
00062   void clear();
00063   /* Const member taking a file name; presumably writes the counts to that file -- TODO confirm format. */
00065   void output(const string &filename) const;
00066   /* Random word selection. randomWord() is non-const while the four mode-specific variants are const; presumably randomWord() dispatches on randomMode -- TODO confirm. */
00069   char * randomWord();
00076   void setRandomMode(int mode);   /* mode is presumably one of R_CTF, R_DF, R_AVE_TF, R_UNIFORM -- confirm. */
00078   int getRandomMode() const;
00079 
00082   char * randomCtf() const;
00085   char * randomDf() const;
00088   char * randomAveTf() const;
00091   char * randomUniform() const;
00092 
00093   /* Size queries; presumably backed by nWords and ctfTot respectively -- TODO confirm. */
00095   int numWords() const;
00097   int totWords() const;
00098   /* Returns a pointer to a const freqmap; presumably the address of the freqInfo member -- confirm. */
00100   const freqmap * getFreqInfo() const;
00101   /* Per-word lookups by C string; behavior for a word that is not present is not visible here. */
00103   int getCtf(const char * word) const;
00105   int getDf(const char * word) const;
00107   double getAveTf(const char * word) const;
00108   /* Compares against another counter lm1 and returns a double; the exact ratio is defined in the implementation. */
00110   double ctfRatio(FreqCounter & lm1) const;
00111   /* handleDoc/handleWord/endDoc: presumably the TextHandler callbacks through which text is fed in -- confirm against TextHandler.hpp. */
00113   char * handleDoc(char * docno);
00115   char * handleWord(char * word);
00116 
00118   void endDoc();
00119   /* Accessors for the name member. */
00121   void setName(const string &freqCounterName);
00123   const string & getName() const;
00124   /* Presumably keeps only the topWords highest-ranked words; the ranking criterion is not visible here -- TODO confirm. */
00126   void pruneBottomWords(int topWords);
00127   
00128 
00129 protected:
00130   /* Loads a language model from file. */
00131   void input(const string &filename);
00132 
00133   /* Collection term frequencies. Declared mutable, so const member functions may modify it. */
00134   mutable freqmap freqInfo;
00135 
00136   /* Words in a doc. */
00137   stringset doc;
00138   /* Random words returned so far. */
00139   stringset randdone;
00140 
00141   /* The frequency counter's name. */
00142   string name;
00143 
00144   /* Stopword list; pointer to a const Stopper, may be NULL per the constructor default. */
00145   const Stopper * stopper;
00146 
00147 
00148   /* used for calculating probabilities when
00149    * selecting a random word
00150    */
00151   /* Sum over words of ctf. */
00152   long ctfTot;
00153   /* Sum over words of df. */
00154   int dfTot;  
00155   /* Sum over words of average tf. Mutable so it can be refreshed from const member functions. */
00156   mutable long double avetfTot;
00157   /* Indicates whether avetfTot is valid (true)
00158    * or needs to be recalculated (false). */
00159   mutable bool atfValid;
00160   /* Random selection mode. */
00161   int randomMode;
00162   /* Number of unique words. */
00163   int nWords;
00164 
00165 
00166 };
00167 
00168 
00169 
00170 #endif

Generated on Wed Nov 3 12:58:55 2004 for Lemur Toolkit by doxygen 1.2.18