Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

BasicSumm.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2002 Carnegie Mellon University.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 #include <iomanip>
00013 #include "Summarizer.hpp"
00014 #include "Passage.hpp"
00015 #include "BasicPassage.hpp"
00016 #include "InvFPIndex.hpp"
00017 #include <algorithm>
00018 #include <vector>
00019 
00020 using std::vector;
00021 
00022 #if (defined(WIN32) && !defined(min))
00023 #define min(x,y) __min(x,y)
00024 #endif
00025 
00026 #ifndef _BASICSUMM_HPP
00027 #define _BASICSUMM_HPP
00028 
00029 //#define EOS      "*eos"
// Term string that BasicSumm::isEOS compares against; the string form
// replaces the older macro kept (commented out) on the line above.
00030 static const string EOS("*eos");
00031 
// Maximum number of term entries BasicSumm::findNextPassage puts into one
// passage when it is called with a zero eos flag.
00032 #define PSG_LEN  15
00033 
00038 class BasicSumm : public Summarizer {
00039 
00040 private:
00041   const InvFPIndex* idx;
00042   int summLen;
00043   vector<BasicPassage> doc;
00044   mutable int iterCount;
00045 
00046 public:
00048   BasicSumm(const InvFPIndex* inIdx, int inSummLen = 5) {
00049     idx = inIdx;
00050     summLen = inSummLen;
00051     iterCount = 1;
00052   };
00053 
00054   virtual void summDocument(const string &docID, const int optLen, const string &qInfo);
00055 
00056   virtual void scorePassages(const string &qInfo);
00057 
00058   virtual void markPassages(int optLen, const string &qInfo);
00059 
00060   virtual void addPassage(Passage &psg);
00061 
00062   virtual void clear(void);
00063 
00064   virtual int fetchPassages(Passage* psgs, int optLen) const;
00065 
00066   virtual int nextPassage(Passage* psg) const;
00067 
00068   virtual void iterClear(void) const ;
00069 
00070   virtual void outputSumm(void) const ;
00071 
00073   int isEOS(const string &check) {
00074     //    return !strcmp(check, EOS);
00075     return (check  == EOS);
00076   }
00077 
00079   int hasEOS(const InvFPIndex* idx, const TermInfoList* tList) {
00080     tList->startIteration();
00081     TermInfo* tEntry;
00082     while (tList->hasMore()) {
00083       tEntry = tList->nextEntry();
00084       if ( isEOS(idx->term(tEntry->termID())) ) return true;
00085     }
00086     return false;
00087   }
00088 
00090   double scorePassage(BasicPassage &psg, const string &qInfo) {
00091     const string &docID = psg.docID;
00092     passageVec psgV= *psg.getAsVector();
00093     double psgLen = psgV.size();
00094     double P = 1;  // no markup yet, all get same weight
00095     double M = 1.5;
00096     double endScore, Tf, tf, idf, docLen, avgDocLen;
00097     endScore = 0.0;
00098     for (int i=0; i < psgLen; i++) {
00099       docLen = idx->docLength(idx->document(docID));
00100       avgDocLen = idx->docLengthAvg();
00101       tf = psgV[i].tf;
00102       Tf = tf / (tf + 0.5 + 1.5 * (docLen/avgDocLen) );
00103       idf = min(M, log((double)idx->docCount()/(double)idx->docCount(psgV[i].termID))); 
00104       endScore += (Tf * idf * P);
00105     }
00106     endScore = endScore / 1+psgLen;
00107     psg.score = endScore;
00108     return endScore;
00109   }
00110 
00112   void findNextPassage(BasicPassage &psg, const InvFPIndex* idx, 
00113                        const TermInfoList* tList, int eos) {
00114     TermInfo* tEntry;
00115     psg.clear();
00116     termCount* storage;
00117     if (eos) {
00118       while (tList->hasMore()) {
00119         tEntry = tList->nextEntry();
00120         if ( isEOS(idx->term(tEntry->termID())) ) return;
00121         storage = new termCount;
00122         storage->termID = tEntry->termID();
00123         storage->tf = tEntry->count();
00124         psg.addTerm(*storage);
00125       }
00126     } else {
00127       for(int i=0; i < PSG_LEN; i++) {
00128         if (tList->hasMore()) {
00129           tEntry = tList->nextEntry();
00130           storage = new termCount;
00131           storage->termID = tEntry->termID();
00132           storage->tf = tEntry->count();
00133           psg.addTerm(*storage);
00134         } else {
00135           return;
00136         }
00137       }
00138     }
00139     return;
00140   }
00141  
00143   void showPassage(const passageVec* psg, const InvFPIndex* idx) const {
00144     for (int i=0; i < psg->size(); i++) {
00145       cout << idx->term((*psg)[i].termID) << " ";
00146     }
00147   }
00148 
00150   void showMarkedPassages() const {
00151     
00152     for (int i=0; i<doc.size(); i++) {
00153       if (doc[i].marked > 0) {
00154         showPassage(doc[i].getAsVector(), idx);
00155         cout << endl;
00156       }
00157     }
00158   }
00159 
00160 }; // BasicSumm
00161 
00162 #endif

Generated on Wed Nov 3 12:58:51 2004 for Lemur Toolkit by doxygen 1.2.18