Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

PDict.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010  */
00011 
00012 
00013 /* 
00014  * dmf 02/2004
00015  */
00016 #ifndef _LEMUR_PDICT_HPP
00017 #define _LEMUR_PDICT_HPP
00018 
00019 #include "common_headers.hpp"
00020 #include "algorithm"
00021 #include "Keyfile.hpp"
00022 #include "File.hpp"
00023 #include "TextHandlerManager.hpp"
00024 
/// One dictionary record: a target string, a type string and a numeric
/// weight. Instances are stored in DictEntryVector and examined by the
/// DictEntryFilter classes.
class DictEntry {
public:
  /// Default constructor (defined out of line).
  DictEntry();

  /// Memberwise constructor.
  /// @param targ copied into #target
  /// @param typ  copied into #type
  /// @param pr   copied into #prob
  DictEntry(const string &targ, const string &typ, double pr)
    : target(targ),
      type(typ),
      prob(pr)
  {
  }

  /// Byte form of the entry, returned as a char buffer.
  /// @param numBytes output argument.
  /// NOTE(review): presumably receives the length of the returned buffer,
  /// and the caller presumably owns that buffer -- confirm in PDict.cpp.
  char *toBytes(int &numBytes) const;

  /// Byte form of the entry, written into a caller-supplied buffer.
  /// NOTE(review): presumably returns the number of bytes written and
  /// expects at least numBytes() bytes of room -- confirm.
  int toBytes(char *buffer) const;

  /// NOTE(review): presumably the size in bytes of the serialized entry --
  /// confirm.
  int numBytes() const;

  /// Counterpart of toBytes(char *): fills this entry from a buffer.
  /// NOTE(review): presumably returns the number of bytes consumed --
  /// confirm.
  int fromBytes(char *buffer);

  /// Two entries are equal when both #target and #type match.
  /// #prob does not take part in the comparison.
  bool operator==(const DictEntry & a) const {
    if (!(target == a.target)) {
      return false;
    }
    return type == a.type;
  }

  /// Text form of the entry.
  /// @param delim separator string; defaults to ";".
  string toString(string delim = ";") const ;

  /// Target term of the entry.
  string target;
  /// Type label of the entry.
  string type;
  /// Weight of the entry (compared against a threshold by
  /// ProbDictEntryFilter and used as the sort key by DictEntryVector).
  double prob;
};
00066 
00070 class DictEntryFilter {
00071 public:
00074   virtual bool accept(const DictEntry &entry) const = 0;
00075   virtual ~DictEntryFilter() {}
00076 } ;
00077 
00078 
00082 class AllDictEntryFilter : public DictEntryFilter {
00083 public:
00086   bool accept(const DictEntry &entry) const { return true; }
00087 };
00088 
00089 
00093 class ProbDictEntryFilter : public DictEntryFilter {
00094 public:
00097   ProbDictEntryFilter(double thresh = 0.0) : threshold(thresh) {
00098   }
00102   bool accept(const DictEntry &entry) const { return entry.prob > threshold; }
00103 private:
00104   double threshold;
00105 };
00106 
00107 
00111 class TypeDictEntryFilter : public DictEntryFilter {
00112 public:
00115   TypeDictEntryFilter(const string &filtType) : type(filtType) {
00116   }
00120   bool accept(const DictEntry &entry) const { return entry.type == type; }
00121 private:
00122   string type;
00123 };
00124 
00129 class StopwordDictEntryFilter : public DictEntryFilter {
00130 public:
00133   StopwordDictEntryFilter(const string &stopwords) {
00134     stopper = TextHandlerManager::createStopper(stopwords);
00135   }
00139   bool accept(const DictEntry &entry) const { 
00140     return !(stopper->stopWord(entry.target.c_str())); 
00141   }
00142 private:
00143   Stopper *stopper;
00144 };
00145 
00147 class DictEntryVector : public vector<DictEntry> {
00148 public:
00149   DictEntryVector() : vector<DictEntry>() {
00150   }
00151   DictEntryVector(char *buffer, DictEntryFilter *filter);
00153   void sortScores() {
00154     sort(this->begin(), this->end(), cmpFn);
00155   }
00160   bool addEntry(DictEntry &entry, double (*compose)(double, double) = NULL);
00161 
00165   bool removeEntry(DictEntry &entry);
00166 
00170   char *toBytes(int &numBytes) const;
00174 
00178   void toBytes(char *buffer) const;
00179 
00180   void fromBytes(char *buffer, DictEntryFilter *filter);
00181 
00184   int numEntries() const;
00185 
00187   void normalize();
00188   
00189 private:
00190   class DictEntryProbDescending { 
00191   public: 
00192     bool operator()(const DictEntry & a, const DictEntry & b) {
00193       return a.prob > b.prob;
00194     }
00195   };
00196   static DictEntryProbDescending cmpFn;
00197 };
00198 
/// Three integer counters kept by PDict in its stats member.
/// NOTE(review): the fields presumably back PDict::getNumPairs(),
/// getSourceCount() and getTargetCount() respectively -- confirm.
struct dictStats {
  /// Size of the dictionary.
  int dictSize;
  /// Size of the source side.
  int sourceSize;
  /// Size of the target side.
  int targetSize;
};
00208 
00212 class PDict {
00213 public:
00215   PDict();
00216   
00218   ~PDict();
00219 
00225   DictEntryVector *getTranslations(const string &term, 
00226                                    DictEntryFilter *filter=NULL) const ;
00231   int numTranslations(const string &term, 
00232                       DictEntryFilter *filter=NULL) const;
00235   int getNumPairs() const;
00236 
00239   int getSourceCount() const;
00240 
00243   int getTargetCount() const ;
00244 
00247   const string &getName() const {return name;}
00248 
00251   bool isUsingCounts() const {return usingCounts;}
00252   
00255   void setUsingCounts(bool val) {usingCounts = val;}
00256   
00262   void add(const string &source, DictEntry &value, 
00263            double (*compose)(double, double) = NULL);
00264 
00268   void remove(const string &source, DictEntry &value);
00269 
00272   void remove(const string &source);
00273 
00278   void write(const string &outputName, const string &delim);
00279 
00286   bool read(const string &dictName, const string &delim, bool counts = false);
00287 
00292   bool open(const string &dictName);
00293 
00298   bool create(const string &dictName);
00299 
00302   void close();
00303   
00306   void normalize();
00307 
00309   void startIteration() {dict.setFirst();}
00310 
00315   DictEntryVector *nextTranslations(string &term, 
00316                                    DictEntryFilter *filter=NULL) const;
00317 
00318 private:
00320   void writeTOC() const;  
00322   bool contains(const string &term, Keyfile &keyfile) const;
00324   void flush();
00326   dictStats stats;
00328   DictEntryVector* currentVec;
00330   bool usingCounts;
00332   string currentTerm;
00334   string name;
00336   mutable Keyfile dict;
00338   mutable Keyfile targetIDs;
00340   mutable File dictEntries;
00341 };
00342 
00343 #endif // _LEMUR_PDICT_HPP

Generated on Wed Nov 3 12:59:01 2004 for Lemur Toolkit by doxygen 1.2.18