00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef _UNIGRAMLM_HPP
00014 #define _UNIGRAMLM_HPP
00015
00016 #include "Counter.hpp"
00017 #include "Exception.hpp"
00018 #include "IndexTypes.hpp"
00019 #include <cstring>
00021
00026 class UnigramLM {
00027 public:
00029 virtual double prob(TERMID_T wordIndex) const = 0;
00031 virtual const string lexiconID() const= 0;
00032
00034 virtual void startIteration() const = 0;
00035 virtual bool hasMore() const = 0;
00036 virtual void nextWordProb(TERMID_T &wordIndex, double &prob) const = 0;
00037 };
00038
00039
00041
00042 class SmoothedMLEstimator : public UnigramLM {
00043 public:
00044 SmoothedMLEstimator(const Counter &counter, const string &lexiconID) : ct(counter), lexID(lexiconID) {}
00045 virtual ~SmoothedMLEstimator() {}
00046
00047 virtual double prob(TERMID_T wordIndex) const {
00048 return (probEstimate(wordIndex, ct.count(wordIndex),ct.sum()));
00049 }
00050
00051 virtual void startIteration() const {
00052 ct.startIteration();
00053 }
00054
00055 virtual bool hasMore() const {
00056 return ct.hasMore();
00057 }
00058
00059 virtual void nextWordProb(TERMID_T &wordIndex, double &prob) const{
00060 double count;
00061
00062 ct.nextCount((int&)wordIndex, count);
00063 prob = probEstimate(wordIndex, count, ct.sum());
00064 }
00065
00066 virtual const string lexiconID() const { return lexID;}
00067
00069 virtual double probEstimate(TERMID_T wordIndex, double wdCount, double sumCount) const=0;
00070
00071 protected:
00072 const Counter &ct;
00073 const string lexID;
00074 };
00075
00077
00078 class MLUnigramLM : public SmoothedMLEstimator {
00079 public:
00080 MLUnigramLM(const Counter & counter, const string &lexiconID) : SmoothedMLEstimator(counter, lexiconID) {};
00081 virtual ~MLUnigramLM() {}
00082
00083 virtual double probEstimate(TERMID_T wordIndex, double count, double sum) const{
00084 return (count/sum);
00085 }
00086 };
00087
00089 class LaplaceUnigramLM : public SmoothedMLEstimator {
00090 public:
00091 LaplaceUnigramLM(const Counter & counter, const string &lexiconID, double vocabSize) : SmoothedMLEstimator(counter, lexiconID), vocSz(vocabSize) {};
00092 virtual ~LaplaceUnigramLM() {}
00093
00094 virtual double probEstimate(TERMID_T wordIndex, double count, double sum) const {
00095 return ((count+1)/(sum+vocSz));
00096 }
00097 private:
00098 double vocSz;
00099 };
00100
00101
00103
00104 class DirichletUnigramLM : public SmoothedMLEstimator {
00105 public:
00106 DirichletUnigramLM(const Counter & counter, const string &lexiconID,
00107 const UnigramLM &refLM, double priorSampleSize)
00108 : SmoothedMLEstimator(counter, lexiconID), ref(&refLM),
00109 s(priorSampleSize) {}
00110
00111 virtual ~DirichletUnigramLM() {}
00112
00113 virtual double probEstimate(TERMID_T wordIndex, double count, double sum) const {
00114 return ((count+s*ref->prob(wordIndex))/(sum+s));
00115 }
00116
00117 private:
00118 const UnigramLM *ref;
00120 double s;
00121 };
00122
00123
00124
00125
00126
00127
00129
00130 class InterpUnigramLM : public SmoothedMLEstimator {
00131 public:
00132 InterpUnigramLM(const Counter & counter, const string &lexiconID,
00133 const UnigramLM &refLM, double refCoeff)
00134 : SmoothedMLEstimator(counter, lexiconID), ref(&refLM),
00135 refC(refCoeff) {}
00136
00137 virtual ~InterpUnigramLM() {}
00138
00139 virtual double probEstimate(TERMID_T wordIndex, double count, double sum) const {
00140 return ((1-refC)*count/sum + refC*ref->prob(wordIndex));
00141 }
00142
00143 private:
00144 const UnigramLM *ref;
00146 double refC;
00147 };
00148
00149
00150
00151
00152
00153 #endif