00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _XLINGRETMETHOD_HPP
00013 #define _XLINGRETMETHOD_HPP
00014
00015 #include "common_headers.hpp"
00016 #include <cmath>
00017 #include <vector>
00018 #include <algorithm>
00019 #include "IndexTypes.hpp"
00020 #include "FreqVector.hpp"
00021 #include "UnigramLM.hpp"
00022 #include "ScoreFunction.hpp"
00023 #include "XLingDocModel.hpp"
00024 #include "TextQueryRep.hpp"
00025 #include "TextQueryRetMethod.hpp"
00026 #include "Counter.hpp"
00027 #include "DocUnigramCounter.hpp"
00028 #include "PDict.hpp"
00029 #include "TextHandlerManager.hpp"
00030
00031 class XLQueryTerm : public QueryTerm {
00032 public:
00033 XLQueryTerm(TERMID_T tid, double wt, const char *term, double pge,
00034 PDict &dic, Stemmer *stm = NULL) :
00035 QueryTerm(tid, wt), source(term), p_s_ge(pge), dict(dic),
00036 stemmer(stm) {
00037 }
00038
00039 XLQueryTerm(const char *term, PDict &dic, Stemmer *stm = NULL) :
00040 QueryTerm(0, 0), source(term), p_s_ge(0), dict(dic), stemmer(stm) {
00041 }
00042
00043 XLQueryTerm(const XLQueryTerm &other) : QueryTerm(0,0), dict(other.dict) {
00044 ti = other.ti;
00045 w = other.w;
00046 p_s_ge = other.p_s_ge;
00047 source = other.source;
00048 stemmer = other.stemmer;
00049 }
00050
00051
00052 virtual ~XLQueryTerm() { }
00053
00055 const string &getSource() const {return source;}
00056
00058 const double getP_s_GE() const {
00059 return p_s_ge;
00060 }
00061
00063 void setWeight(double wt) {
00064 w = wt;
00065 }
00066
00068 void incWeight(double wt) {
00069 w += wt;
00070 }
00072 virtual bool operator==(const XLQueryTerm& other) const {
00073 return (other.source == source);
00074 }
00075
00077 virtual XLQueryTerm& operator=(const XLQueryTerm& other) {
00078 ti = other.ti;
00079 w = other.w;
00080 p_s_ge = other.p_s_ge;
00081 source = other.source;
00082 dict = other.dict;
00083 stemmer = other.stemmer;
00084 return (*this);
00085 }
00090 DictEntryVector *getTranslations() const {
00091 DictEntryVector *xlates = dict.getTranslations(source);
00092
00093 if (xlates == NULL && stemmer != NULL) {
00094
00095 char tmpTerm[512];
00096 strcpy(tmpTerm, source.c_str());
00097 string stem = stemmer->stemWord(tmpTerm);
00098 cerr << "getTranslations: stemming " << source << " to " << stem
00099 << endl;
00100 xlates = dict.getTranslations(stem);
00101 }
00102 return xlates;
00103 }
00104 private:
00105 string source;
00106 double p_s_ge;
00107 PDict &dict;
00108 Stemmer *stemmer;
00109 };
00110
00111
00113 class XLingQueryModel : public QueryRep {
00114 public:
00123 XLingQueryModel(const TermQuery &qry, const Index &source,
00124 bool dbS, double numSource,
00125 PDict &dict, const Stopper *stp = NULL,
00126 Stemmer *stm = NULL) {
00127
00128
00129 double pge;
00130 numTerms = 0;
00131
00132 qry.startTermIteration();
00133 while (qry.hasMore()) {
00134 const Term *t = qry.nextTerm();
00135
00136 if (stp == NULL || !(stp->stopWord(t->spelling()))) {
00137 numTerms++;
00138 XLQueryTerm st(t->spelling(), dict, stm);
00139 iter = find(qTerms.begin(), qTerms.end(), st);
00140 if (iter != qTerms.end()) {
00141
00142 (*iter).incWeight(1);
00143 } else {
00144
00145 TERMID_T ti = source.term(t->spelling());
00146 if (ti>0) {
00147
00148 if (dbS) {
00149 pge = source.docCount(ti)/numSource;
00150 } else {
00151 pge = (source.termCount(ti)/numSource);
00152 }
00153 } else {
00154
00155
00156
00157 pge = (0.000001*0.000001);
00158 }
00159 XLQueryTerm newTerm(ti, 1, t->spelling(), pge, dict, stm);
00160 qTerms.push_back(newTerm);
00161 }
00162 } else {
00163 cerr << "XLingQueryModel: " << t->spelling()
00164 << " on stoplist, ignoring" << endl;
00165 }
00166
00167 }
00168 }
00169
00170 virtual ~XLingQueryModel() {
00171 }
00172
00174 virtual void startIteration() const {
00175 iter = qTerms.begin();
00176 }
00178 virtual bool hasMore() const {
00179 return (iter != qTerms.end());
00180 }
00182
00183 virtual XLQueryTerm &nextTerm() const {
00184 return (*iter++);
00185 }
00186 virtual int getNumTerms() const {return numTerms;}
00187
00188 private:
00189 mutable vector<XLQueryTerm> qTerms;
00190 mutable vector<XLQueryTerm>::iterator iter;
00191 int numTerms;
00192 };
00193
00194
00195
00196
00203
00204 class XLingRetMethod : public RetrievalMethod {
00205 public:
00206
00219 XLingRetMethod(const Index &dbIndex, const Index &background,
00220 PDict &dict, ScoreAccumulator &accumulator,
00221 double l, double b, bool cacheDR,
00222 string &sBM, string &tBM,
00223 const Stopper *stp = NULL, Stemmer *stm = NULL);
00225 virtual ~XLingRetMethod();
00226
00230 virtual DocumentRep *computeDocRep(DOCID_T docID);
00231
00238 virtual double matchedTermWeight(TERMID_T id, double weight,
00239 const DocInfo *info,
00240 const DocumentRep *dRep) const {
00241 double d = dRep->termWeight(id,info);
00242 double score = d * weight;
00243 return score;
00244 }
00245
00250 virtual double adjustedScore(double origScore, double pge) const {
00251 return (log((lambda * origScore) + ((1 - lambda) * pge)));
00252 }
00253
00254 virtual void scoreCollection(const QueryRep &qry,
00255 IndexedRealVector &results){
00256 scoreInvertedIndex(qry, results);
00257 }
00258
00259 virtual void scoreInvertedIndex(const QueryRep &qryRep,
00260 IndexedRealVector &scores,
00261 bool scoreAll = false);
00262
00263 virtual QueryRep *computeQueryRep(const Query &qry) {
00264 if (const TermQuery *q = dynamic_cast<const TermQuery *>(&qry))
00265 return (new XLingQueryModel(*q, source, docBasedSourceSmooth, numSource,
00266 dictionary, stopper, stemmer));
00267 else LEMUR_THROW(LEMUR_RUNTIME_ERROR, "XLingRetMethod expects query of type TermQuery");
00268 }
00269
00270 virtual QueryRep *computeTargetKLRep(const QueryRep *qry);
00271
00273 virtual double scoreDoc(const QueryRep &qry, DOCID_T docID);
00274
00276 virtual void updateQuery(QueryRep &qryRep, const DocIDSet &relDocs) {}
00277
00278 protected:
00279 virtual double scoreDocVector(const XLingQueryModel &qRep, DOCID_T docID,
00280 FreqVector &docVector);
00281
00282 double lambda;
00283 double beta;
00284 double numSource;
00285 double numTarget;
00286 bool docBasedSourceSmooth;
00287 bool docBasedTargetSmooth;
00288 ScoreAccumulator &scAcc;
00289 PDict &dictionary;
00290 Stemmer *stemmer;
00291 const Stopper *stopper;
00292 const Index &source;
00294 DocumentRep **docReps;
00296 bool cacheDocReps;
00298 int docRepsSize;
00299 ScoreAccumulator *termScores;
00300 };
00301
00302 #endif