Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

UnigramLM.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2001 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 #ifndef _UNIGRAMLM_HPP
00014 #define _UNIGRAMLM_HPP
00015 
00016 #include "Counter.hpp"
00017 #include "Exception.hpp"
00018 #include "IndexTypes.hpp"
00019 #include <cstring>
00021 
00026 class UnigramLM {
00027 public:
00029   virtual double prob(TERMID_T wordIndex) const = 0;
00031   virtual const string lexiconID() const= 0;
00032 
00034   virtual void startIteration() const = 0;
00035   virtual bool hasMore() const = 0;
00036   virtual void nextWordProb(TERMID_T &wordIndex, double &prob) const = 0;
00037 };
00038 
00039 
00041 
00042 class SmoothedMLEstimator : public UnigramLM {
00043 public:
00044   SmoothedMLEstimator(const Counter &counter, const string &lexiconID) : ct(counter), lexID(lexiconID) {}
00045   virtual ~SmoothedMLEstimator() {}
00046 
00047   virtual double prob(TERMID_T wordIndex) const {
00048     return (probEstimate(wordIndex, ct.count(wordIndex),ct.sum()));
00049   }
00050 
00051   virtual void startIteration() const {
00052     ct.startIteration();
00053   }
00054 
00055   virtual bool hasMore() const {
00056     return ct.hasMore();
00057   }
00058 
00059   virtual void nextWordProb(TERMID_T &wordIndex, double &prob) const{
00060     double count;
00061     //dmf FIXME
00062     ct.nextCount((int&)wordIndex, count);
00063     prob = probEstimate(wordIndex, count, ct.sum());
00064   }
00065   
00066   virtual const string lexiconID() const { return lexID;}
00067 
00069   virtual double probEstimate(TERMID_T wordIndex, double wdCount, double sumCount) const=0;
00070 
00071 protected:
00072   const Counter &ct;
00073   const string lexID;
00074 };
00075   
00077 
00078 class MLUnigramLM : public SmoothedMLEstimator { 
00079 public:
00080   MLUnigramLM(const Counter & counter, const string &lexiconID) : SmoothedMLEstimator(counter, lexiconID) {};
00081   virtual ~MLUnigramLM() {}
00082   
00083   virtual double probEstimate(TERMID_T wordIndex, double count, double sum) const{
00084     return (count/sum);
00085   }
00086 };
00087 
00089 class LaplaceUnigramLM : public SmoothedMLEstimator { 
00090 public:
00091   LaplaceUnigramLM(const Counter & counter, const string &lexiconID, double vocabSize) : SmoothedMLEstimator(counter, lexiconID), vocSz(vocabSize) {};
00092   virtual ~LaplaceUnigramLM() {}
00093   
00094   virtual double probEstimate(TERMID_T wordIndex, double count, double sum) const {
00095     return ((count+1)/(sum+vocSz));
00096   }
00097 private:
00098   double vocSz;
00099 };
00100 
00101 
00103 
00104 class DirichletUnigramLM : public SmoothedMLEstimator { 
00105 public:
00106   DirichletUnigramLM(const Counter & counter, const string &lexiconID, 
00107                      const UnigramLM &refLM, double priorSampleSize) 
00108     : SmoothedMLEstimator(counter, lexiconID), ref(&refLM), 
00109     s(priorSampleSize) {}
00110 
00111   virtual ~DirichletUnigramLM() {}
00112   
00113   virtual double probEstimate(TERMID_T wordIndex, double count, double sum) const {
00114     return ((count+s*ref->prob(wordIndex))/(sum+s));
00115   }
00116 
00117 private:
00118   const UnigramLM *ref;
00120   double s;  
00121 };
00122 
00123 
00124 
00125 
00126 
00127 
00129 
00130 class InterpUnigramLM : public SmoothedMLEstimator { 
00131 public:
00132   InterpUnigramLM(const Counter & counter, const string &lexiconID, 
00133                      const UnigramLM &refLM, double refCoeff) 
00134     : SmoothedMLEstimator(counter, lexiconID), ref(&refLM), 
00135     refC(refCoeff) {}
00136 
00137   virtual ~InterpUnigramLM() {}
00138   
00139   virtual double probEstimate(TERMID_T wordIndex, double count, double sum) const {
00140     return ((1-refC)*count/sum + refC*ref->prob(wordIndex));
00141   }
00142 
00143 private:
00144   const UnigramLM *ref;
00146   double refC;  
00147 };
00148 
00149 
00150 
00151 
00152 
00153 #endif /* _UNIGRAMLM_HPP */

Generated on Wed Nov 3 12:59:07 2004 for Lemur Toolkit by doxygen 1.2.18