00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #ifndef _LEMUR_PDICT_HPP
00017 #define _LEMUR_PDICT_HPP
00018
00019 #include "common_headers.hpp"
00020 #include "algorithm"
00021 #include "Keyfile.hpp"
00022 #include "File.hpp"
00023 #include "TextHandlerManager.hpp"
00024
/// A single dictionary entry: a target string, a type string and a
/// numeric weight (prob).
class DictEntry {
public:
  /// Default constructor. Defined outside this header, so the initial
  /// field values cannot be stated from here.
  DictEntry();

  /// Construct an entry from its three fields.
  /// @param targ copied into \c target
  /// @param typ  copied into \c type
  /// @param pr   stored in \c prob
  DictEntry(const string &targ, const string &typ, double pr)
    : target(targ),
      type(typ),
      prob(pr)
  {
  }

  /// Serialize this entry and return the bytes.
  /// @param numBytes output parameter (presumably receives the length of
  ///        the returned buffer -- confirm against the implementation).
  /// @return pointer to the serialized bytes.
  /// NOTE(review): who frees the returned buffer is not visible here.
  char *toBytes(int &numBytes) const;

  /// Serialize this entry into a caller-supplied buffer.
  /// @param buffer destination (presumably at least numBytes() long -- confirm).
  /// @return an int (presumably the number of bytes written -- confirm).
  int toBytes(char *buffer) const;

  /// @return an int (presumably the serialized size of this entry -- confirm).
  int numBytes() const;

  /// Fill this entry from a serialized buffer.
  /// @param buffer source bytes.
  /// @return an int (presumably the number of bytes consumed -- confirm).
  int fromBytes(char *buffer);

  /// Two entries compare equal when both \c target and \c type match.
  /// \c prob takes no part in the comparison.
  bool operator==(const DictEntry &a) const {
    if (!(target == a.target)) {
      return false;
    }
    return type == a.type;
  }

  /// Render the entry as a string.
  /// @param delim delimiter passed to the implementation; defaults to ";".
  string toString(string delim = ";") const;

  /// Target string of the entry.
  string target;
  /// Type string of the entry.
  string type;
  /// Numeric weight of the entry.
  double prob;
};
00066
00070 class DictEntryFilter {
00071 public:
00074 virtual bool accept(const DictEntry &entry) const = 0;
00075 virtual ~DictEntryFilter() {}
00076 } ;
00077
00078
00082 class AllDictEntryFilter : public DictEntryFilter {
00083 public:
00086 bool accept(const DictEntry &entry) const { return true; }
00087 };
00088
00089
00093 class ProbDictEntryFilter : public DictEntryFilter {
00094 public:
00097 ProbDictEntryFilter(double thresh = 0.0) : threshold(thresh) {
00098 }
00102 bool accept(const DictEntry &entry) const { return entry.prob > threshold; }
00103 private:
00104 double threshold;
00105 };
00106
00107
00111 class TypeDictEntryFilter : public DictEntryFilter {
00112 public:
00115 TypeDictEntryFilter(const string &filtType) : type(filtType) {
00116 }
00120 bool accept(const DictEntry &entry) const { return entry.type == type; }
00121 private:
00122 string type;
00123 };
00124
00129 class StopwordDictEntryFilter : public DictEntryFilter {
00130 public:
00133 StopwordDictEntryFilter(const string &stopwords) {
00134 stopper = TextHandlerManager::createStopper(stopwords);
00135 }
00139 bool accept(const DictEntry &entry) const {
00140 return !(stopper->stopWord(entry.target.c_str()));
00141 }
00142 private:
00143 Stopper *stopper;
00144 };
00145
00147 class DictEntryVector : public vector<DictEntry> {
00148 public:
00149 DictEntryVector() : vector<DictEntry>() {
00150 }
00151 DictEntryVector(char *buffer, DictEntryFilter *filter);
00153 void sortScores() {
00154 sort(this->begin(), this->end(), cmpFn);
00155 }
00160 bool addEntry(DictEntry &entry, double (*compose)(double, double) = NULL);
00161
00165 bool removeEntry(DictEntry &entry);
00166
00170 char *toBytes(int &numBytes) const;
00174
00178 void toBytes(char *buffer) const;
00179
00180 void fromBytes(char *buffer, DictEntryFilter *filter);
00181
00184 int numEntries() const;
00185
00187 void normalize();
00188
00189 private:
00190 class DictEntryProbDescending {
00191 public:
00192 bool operator()(const DictEntry & a, const DictEntry & b) {
00193 return a.prob > b.prob;
00194 }
00195 };
00196 static DictEntryProbDescending cmpFn;
00197 };
00198
/// Three integer statistics kept for a dictionary.
/// NOTE(review): the meanings below are inferred from the field names.
struct dictStats {
  /// Presumably the number of source/target pairs in the dictionary.
  int dictSize;

  /// Presumably the number of distinct source terms.
  int sourceSize;

  /// Presumably the number of distinct target terms.
  int targetSize;
};
00208
00212 class PDict {
00213 public:
00215 PDict();
00216
00218 ~PDict();
00219
00225 DictEntryVector *getTranslations(const string &term,
00226 DictEntryFilter *filter=NULL) const ;
00231 int numTranslations(const string &term,
00232 DictEntryFilter *filter=NULL) const;
00235 int getNumPairs() const;
00236
00239 int getSourceCount() const;
00240
00243 int getTargetCount() const ;
00244
00247 const string &getName() const {return name;}
00248
00251 bool isUsingCounts() const {return usingCounts;}
00252
00255 void setUsingCounts(bool val) {usingCounts = val;}
00256
00262 void add(const string &source, DictEntry &value,
00263 double (*compose)(double, double) = NULL);
00264
00268 void remove(const string &source, DictEntry &value);
00269
00272 void remove(const string &source);
00273
00278 void write(const string &outputName, const string &delim);
00279
00286 bool read(const string &dictName, const string &delim, bool counts = false);
00287
00292 bool open(const string &dictName);
00293
00298 bool create(const string &dictName);
00299
00302 void close();
00303
00306 void normalize();
00307
00309 void startIteration() {dict.setFirst();}
00310
00315 DictEntryVector *nextTranslations(string &term,
00316 DictEntryFilter *filter=NULL) const;
00317
00318 private:
00320 void writeTOC() const;
00322 bool contains(const string &term, Keyfile &keyfile) const;
00324 void flush();
00326 dictStats stats;
00328 DictEntryVector* currentVec;
00330 bool usingCounts;
00332 string currentTerm;
00334 string name;
00336 mutable Keyfile dict;
00338 mutable Keyfile targetIDs;
00340 mutable File dictEntries;
00341 };
00342
00343 #endif // _LEMUR_PDICT_HPP