00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <iomanip>
00013 #include "Summarizer.hpp"
00014 #include "Passage.hpp"
00015 #include "BasicPassage.hpp"
00016 #include "InvFPIndex.hpp"
00017 #include <algorithm>
00018 #include <vector>
00019
00020 using std::vector;
00021
00022 #if (defined(WIN32) && !defined(min))
00023 #define min(x,y) __min(x,y)
00024 #endif
00025
00026 #ifndef _BASICSUMM_HPP
00027 #define _BASICSUMM_HPP
00028
00029
00030 static const string EOS("*eos"); // Marker term text that BasicSumm::isEOS() compares against (presumably "end of sentence" -- confirm with the indexer).
00031
00032 #define PSG_LEN 15 // Upper bound on terms gathered per passage by BasicSumm::findNextPassage() when its eos flag is zero.
00033
00038 class BasicSumm : public Summarizer {
00039
00040 private:
00041 const InvFPIndex* idx;
00042 int summLen;
00043 vector<BasicPassage> doc;
00044 mutable int iterCount;
00045
00046 public:
00048 BasicSumm(const InvFPIndex* inIdx, int inSummLen = 5) {
00049 idx = inIdx;
00050 summLen = inSummLen;
00051 iterCount = 1;
00052 };
00053
00054 virtual void summDocument(const string &docID, const int optLen, const string &qInfo);
00055
00056 virtual void scorePassages(const string &qInfo);
00057
00058 virtual void markPassages(int optLen, const string &qInfo);
00059
00060 virtual void addPassage(Passage &psg);
00061
00062 virtual void clear(void);
00063
00064 virtual int fetchPassages(Passage* psgs, int optLen) const;
00065
00066 virtual int nextPassage(Passage* psg) const;
00067
00068 virtual void iterClear(void) const ;
00069
00070 virtual void outputSumm(void) const ;
00071
00073 int isEOS(const string &check) {
00074
00075 return (check == EOS);
00076 }
00077
00079 int hasEOS(const InvFPIndex* idx, const TermInfoList* tList) {
00080 tList->startIteration();
00081 TermInfo* tEntry;
00082 while (tList->hasMore()) {
00083 tEntry = tList->nextEntry();
00084 if ( isEOS(idx->term(tEntry->termID())) ) return true;
00085 }
00086 return false;
00087 }
00088
00090 double scorePassage(BasicPassage &psg, const string &qInfo) {
00091 const string &docID = psg.docID;
00092 passageVec psgV= *psg.getAsVector();
00093 double psgLen = psgV.size();
00094 double P = 1;
00095 double M = 1.5;
00096 double endScore, Tf, tf, idf, docLen, avgDocLen;
00097 endScore = 0.0;
00098 for (int i=0; i < psgLen; i++) {
00099 docLen = idx->docLength(idx->document(docID));
00100 avgDocLen = idx->docLengthAvg();
00101 tf = psgV[i].tf;
00102 Tf = tf / (tf + 0.5 + 1.5 * (docLen/avgDocLen) );
00103 idf = min(M, log((double)idx->docCount()/(double)idx->docCount(psgV[i].termID)));
00104 endScore += (Tf * idf * P);
00105 }
00106 endScore = endScore / 1+psgLen;
00107 psg.score = endScore;
00108 return endScore;
00109 }
00110
00112 void findNextPassage(BasicPassage &psg, const InvFPIndex* idx,
00113 const TermInfoList* tList, int eos) {
00114 TermInfo* tEntry;
00115 psg.clear();
00116 termCount* storage;
00117 if (eos) {
00118 while (tList->hasMore()) {
00119 tEntry = tList->nextEntry();
00120 if ( isEOS(idx->term(tEntry->termID())) ) return;
00121 storage = new termCount;
00122 storage->termID = tEntry->termID();
00123 storage->tf = tEntry->count();
00124 psg.addTerm(*storage);
00125 }
00126 } else {
00127 for(int i=0; i < PSG_LEN; i++) {
00128 if (tList->hasMore()) {
00129 tEntry = tList->nextEntry();
00130 storage = new termCount;
00131 storage->termID = tEntry->termID();
00132 storage->tf = tEntry->count();
00133 psg.addTerm(*storage);
00134 } else {
00135 return;
00136 }
00137 }
00138 }
00139 return;
00140 }
00141
00143 void showPassage(const passageVec* psg, const InvFPIndex* idx) const {
00144 for (int i=0; i < psg->size(); i++) {
00145 cout << idx->term((*psg)[i].termID) << " ";
00146 }
00147 }
00148
00150 void showMarkedPassages() const {
00151
00152 for (int i=0; i<doc.size(); i++) {
00153 if (doc[i].marked > 0) {
00154 showPassage(doc[i].getAsVector(), idx);
00155 cout << endl;
00156 }
00157 }
00158 }
00159
00160 };
00161
00162 #endif