Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

XLingRetMethod.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  *
00003  *  Original source copyright (c) 2001, Carnegie Mellon University.
00004  *  See copyright.cmu for details.
00005  *  Modifications copyright (c) 2002, University of Massachusetts.
00006  *  See copyright.umass for details.
00007  *
00008  *==========================================================================
00009 */
00010 
00011 
00012 #ifndef _XLINGRETMETHOD_HPP
00013 #define _XLINGRETMETHOD_HPP
00014 
00015 #include "common_headers.hpp"
00016 #include <cmath>
00017 #include <vector>
00018 #include <algorithm>
00019 #include "IndexTypes.hpp"
00020 #include "FreqVector.hpp"
00021 #include "UnigramLM.hpp"
00022 #include "ScoreFunction.hpp"
00023 #include "XLingDocModel.hpp"
00024 #include "TextQueryRep.hpp"
00025 #include "TextQueryRetMethod.hpp"
00026 #include "Counter.hpp"
00027 #include "DocUnigramCounter.hpp"
00028 #include "PDict.hpp"
00029 #include "TextHandlerManager.hpp"
00030 
00031 class XLQueryTerm : public QueryTerm {
00032 public:
00033   XLQueryTerm(TERMID_T tid, double  wt, const char *term, double pge,
00034               PDict &dic, Stemmer *stm = NULL) :
00035     QueryTerm(tid, wt), source(term), p_s_ge(pge), dict(dic),
00036     stemmer(stm) {
00037   }
00038 
00039   XLQueryTerm(const char *term, PDict &dic, Stemmer *stm = NULL) : 
00040     QueryTerm(0, 0), source(term), p_s_ge(0), dict(dic), stemmer(stm) {
00041   }
00042 
00043   XLQueryTerm(const XLQueryTerm &other) : QueryTerm(0,0), dict(other.dict) {
00044     ti = other.ti;
00045     w = other.w;
00046     p_s_ge = other.p_s_ge;
00047     source = other.source;
00048     stemmer = other.stemmer;
00049   }
00050 
00051 
00052   virtual ~XLQueryTerm() { }
00053 
00055   const string &getSource() const {return source;}
00056 
00058   const double getP_s_GE() const {
00059     return p_s_ge;
00060   }
00061   
00063   void setWeight(double wt) {
00064     w = wt;
00065   }
00066 
00068   void incWeight(double wt) {
00069     w += wt;
00070   }
00072   virtual bool operator==(const XLQueryTerm& other) const {
00073     return (other.source == source);
00074   }
00075 
00077   virtual XLQueryTerm& operator=(const XLQueryTerm& other)  {
00078     ti = other.ti;
00079     w = other.w;
00080     p_s_ge = other.p_s_ge;
00081     source = other.source;
00082     dict = other.dict;
00083     stemmer = other.stemmer;
00084     return (*this);
00085   }
00090   DictEntryVector *getTranslations() const {
00091     DictEntryVector *xlates = dict.getTranslations(source);
00092     // If no xlates, Leah's version stems the term and tries again.
00093     if (xlates == NULL && stemmer != NULL) {
00094       // porter stemmer is destructive
00095       char tmpTerm[512];
00096       strcpy(tmpTerm, source.c_str());
00097       string stem = stemmer->stemWord(tmpTerm);
00098       cerr << "getTranslations: stemming " << source << " to " << stem 
00099            << endl;
00100       xlates = dict.getTranslations(stem);
00101     }
00102     return xlates;
00103   }
00104 private:
00105   string source;
00106   double p_s_ge;
00107   PDict &dict;  
00108   Stemmer *stemmer;
00109 };
00110 
00111 
00113 class XLingQueryModel : public QueryRep {
00114 public:
00123   XLingQueryModel(const TermQuery &qry, const Index &source, 
00124                   bool dbS, double numSource,
00125                   PDict &dict, const Stopper *stp = NULL, 
00126                   Stemmer *stm = NULL) {
00127     // fill in weighted terms
00128     // P(e|GE)
00129     double pge;
00130     numTerms = 0;
00131     
00132     qry.startTermIteration();
00133     while (qry.hasMore()) {
00134       const Term *t = qry.nextTerm();
00135       // if Stopper is not NULL, test for stopwords.
00136       if (stp == NULL || !(stp->stopWord(t->spelling()))) {
00137         numTerms++;
00138         XLQueryTerm st(t->spelling(), dict, stm);
00139         iter = find(qTerms.begin(), qTerms.end(), st);
00140         if (iter != qTerms.end()) {
00141           // found it, bump count
00142           (*iter).incWeight(1);
00143         } else {
00144           // new term
00145           TERMID_T ti = source.term(t->spelling());
00146           if (ti>0) {
00147             // pge
00148             if (dbS) {
00149               pge = source.docCount(ti)/numSource;
00150             } else {
00151               pge = (source.termCount(ti)/numSource);      
00152             }
00153           } else {
00154             // OOV, use default pge
00155             // perhaps this would be better estimated with:
00156             //    pge = 1/(numSource + 1);
00157             pge = (0.000001*0.000001);
00158           }
00159           XLQueryTerm newTerm(ti, 1, t->spelling(), pge, dict, stm);
00160           qTerms.push_back(newTerm);
00161         }
00162       } else {
00163         cerr << "XLingQueryModel: " << t->spelling() 
00164              << " on stoplist, ignoring" << endl;
00165       }
00166       
00167     }
00168   }
00169   
00170   virtual ~XLingQueryModel() {
00171   }
00172 
00174   virtual void startIteration() const {
00175     iter = qTerms.begin();
00176   }
00178   virtual bool hasMore() const {
00179     return (iter != qTerms.end());
00180   }
00182   //  virtual XLQueryTerm &nextTerm() {
00183   virtual XLQueryTerm &nextTerm() const {
00184     return (*iter++);
00185   }
00186   virtual int getNumTerms() const {return numTerms;}
00187   
00188 private:
00189   mutable vector<XLQueryTerm> qTerms;
00190   mutable vector<XLQueryTerm>::iterator iter;
00191   int numTerms;
00192 };
00193 
00194 // Should not really be a TextQueryRetMethod, as it does not score
00195 // in a like fashion, but it does take advantage of the cached doc reps.
00196 //
00203 //class XLingRetMethod : public TextQueryRetMethod {
00204 class XLingRetMethod : public RetrievalMethod {
00205 public:
00206 
00219   XLingRetMethod(const Index &dbIndex, const Index &background, 
00220                  PDict &dict, ScoreAccumulator &accumulator, 
00221                  double l, double b, bool cacheDR,
00222                  string &sBM, string &tBM, 
00223                  const Stopper *stp = NULL, Stemmer *stm = NULL);
00225   virtual ~XLingRetMethod();
00226   
00230   virtual DocumentRep *computeDocRep(DOCID_T docID);
00231 
00238   virtual double matchedTermWeight(TERMID_T id, double weight,
00239                                    const DocInfo *info, 
00240                                    const DocumentRep *dRep) const { 
00241     double d = dRep->termWeight(id,info); //P(a|D)
00242     double score = d * weight; //P(a|D) * P(e|a)
00243     return score;
00244   }
00245 
00250   virtual double adjustedScore(double origScore, double pge) const {
00251     return (log((lambda * origScore) + ((1 - lambda) * pge)));
00252   }
00253 
00254   virtual void scoreCollection(const QueryRep &qry, 
00255                                IndexedRealVector &results){
00256     scoreInvertedIndex(qry, results);
00257   }
00258   // Override (have to do individual doc ones too.
00259   virtual void scoreInvertedIndex(const QueryRep &qryRep, 
00260                                   IndexedRealVector &scores, 
00261                                   bool scoreAll = false);
00262 
00263   virtual QueryRep *computeQueryRep(const Query &qry) {
00264     if (const TermQuery *q = dynamic_cast<const TermQuery *>(&qry))
00265       return (new XLingQueryModel(*q, source, docBasedSourceSmooth, numSource,
00266                                   dictionary, stopper, stemmer));
00267     else LEMUR_THROW(LEMUR_RUNTIME_ERROR, "XLingRetMethod expects query of type TermQuery");
00268   } 
00269 
00270   virtual QueryRep *computeTargetKLRep(const QueryRep *qry);
00271 
00273   virtual double scoreDoc(const QueryRep &qry, DOCID_T docID);
00274 
00276   virtual void updateQuery(QueryRep &qryRep, const DocIDSet &relDocs) {}
00277 
00278 protected:
00279   virtual double scoreDocVector(const XLingQueryModel &qRep, DOCID_T docID, 
00280                                 FreqVector &docVector);
00281 
00282   double lambda;
00283   double beta;
00284   double numSource;
00285   double numTarget;
00286   bool docBasedSourceSmooth;
00287   bool docBasedTargetSmooth;
00288   ScoreAccumulator &scAcc; // this does not need to be passed in. Bleah.
00289   PDict &dictionary;
00290   Stemmer *stemmer; // source language
00291   const Stopper *stopper; // source language
00292   const Index &source;
00294   DocumentRep **docReps;
00296   bool cacheDocReps;
00298   int docRepsSize;
00299   ScoreAccumulator *termScores;
00300 };
00301 
00302 #endif /* _XLINGRETMETHOD_HPP */

Generated on Wed Nov 3 12:59:08 2004 for Lemur Toolkit by doxygen1.2.18