00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 #ifndef _FREQCOUNTER_HPP 00013 #define _FREQCOUNTER_HPP 00014 00015 #include <map> 00016 #include <set> 00017 #include "InvFPTypes.hpp" 00018 #include "TextHandler.hpp" 00019 #include "Stopper.hpp" 00020 00022 #define R_CTF 0 00023 00024 #define R_DF 1 00025 00026 #define R_AVE_TF 2 00027 00028 #define R_UNIFORM 3 00029 00031 typedef struct freqinfo_tt { 00032 char * word; 00033 int ctf; 00034 int df; 00035 } freqinfo_t; 00036 00038 typedef map<char *, freqinfo_t, ltstr> freqmap; 00040 typedef set<char *, ltstr> stringset; 00041 00042 00048 class FreqCounter : public TextHandler { 00049 00050 public: 00053 FreqCounter(const Stopper * stopWords = NULL); 00056 FreqCounter(const string &filename, const Stopper * stopWords = NULL); 00057 00059 ~FreqCounter(); 00060 00062 void clear(); 00063 00065 void output(const string &filename) const; 00066 00069 char * randomWord(); 00076 void setRandomMode(int mode); 00078 int getRandomMode() const; 00079 00082 char * randomCtf() const; 00085 char * randomDf() const; 00088 char * randomAveTf() const; 00091 char * randomUniform() const; 00092 00093 00095 int numWords() const; 00097 int totWords() const; 00098 00100 const freqmap * getFreqInfo() const; 00101 00103 int getCtf(const char * word) const; 00105 int getDf(const char * word) const; 00107 double getAveTf(const char * word) const; 00108 00110 double ctfRatio(FreqCounter & lm1) const; 00111 00113 char * handleDoc(char * docno); 00115 char * handleWord(char * word); 00116 00118 void endDoc(); 00119 00121 void setName(const string &freqCounterName); 00123 const string & getName() const; 00124 00126 void pruneBottomWords(int topWords); 00127 00128 00129 protected: 00130 /* Loads a language model from file. */ 00131 void input(const string &filename); 00132 00133 /* Collection term frequencies. */ 00134 mutable freqmap freqInfo; 00135 00136 /* Words in a doc. */ 00137 stringset doc; 00138 /* Random words returned so far. */ 00139 stringset randdone; 00140 00141 /* The frequency counter's name. */ 00142 string name; 00143 00144 /* Stopword list */ 00145 const Stopper * stopper; 00146 00147 00148 /* used for calculating probabilities when 00149 * selecting a random word 00150 */ 00151 /* Sum over words of ctf. */ 00152 long ctfTot; 00153 /* Sum over words of df. */ 00154 int dfTot; 00155 /* Sum over words of average tf. */ 00156 mutable long double avetfTot; 00157 /* Indicates whether avetfTot is valid (true) 00158 * or needs to be recalculated (false). */ 00159 mutable bool atfValid; 00160 /* Random selection mode. */ 00161 int randomMode; 00162 /* Number of unique words. */ 00163 int nWords; 00164 00165 00166 }; 00167 00168 00169 00170 #endif