Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

OkapiTermScoreFunction.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // OkapiTermScoreFunction
00015 //
00016 // 3 February 2004 -- tds
00017 //
00018 
00019 #ifndef LEMUR_OKAPITERMSCOREFUNCTION_HPP
00020 #define LEMUR_OKAPITERMSCOREFUNCTION_HPP
00021 
00022 #include "indri/TermScoreFunction.hpp"
00023 #include "indri/IndriIndex.hpp"
00024 #include <math.h>
00025 
00026 class OkapiTermScoreFunction : public TermScoreFunction {
00027 private:
00029   double _inverseDocumentFrequency; 
00031   double _averageDocumentLength;
00032 
00033   double _termWeight;
00034 
00035   // These are BM25 parameters
00036   double _k1;
00037   double _b;
00038 
00039   // The following values are precomputed so that score computation will go faster
00040   double _bOverAvgDocLength;
00041   double _k1TimesOneMinusB;
00042   double _idfTimesK1PlusOne;
00043   double _k1TimesBOverAvgDocLength;
00044   double _termWeightTimesIDFTimesK1;
00045 
00046   void _precomputeConstants() {
00047     _idfTimesK1PlusOne = _inverseDocumentFrequency * ( _k1 + 1 );
00048     _k1TimesOneMinusB = _k1 * (1-_b);
00049     _bOverAvgDocLength = _b / _averageDocumentLength;
00050     _k1TimesBOverAvgDocLength = _k1 * _bOverAvgDocLength;
00051     _termWeightTimesIDFTimesK1 = _termWeight * _inverseDocumentFrequency * _k1;
00052   }
00053 
00054 public:
00055   OkapiTermScoreFunction( IndriIndex& index, int termID, double termWeight, double k1 = 1, double b = 0.5 ) {
00056     double idfNumerator = index.docCount()+1;
00057     double idfDenominator = 0.5+index.docCount( termID );
00058 
00059     _inverseDocumentFrequency = log( idfNumerator / idfDenominator );
00060     _averageDocumentLength = index.docLengthAvg();
00061     _termWeight = termWeight;
00062 
00063     _k1 = k1;
00064     _b = b;
00065 
00066     _precomputeConstants();
00067   }
00068 
00069   OkapiTermScoreFunction( double idf, double averageDocumentLength, double k1, double b ) {
00070     _inverseDocumentFrequency = idf;
00071     _averageDocumentLength = averageDocumentLength;
00072 
00073     _k1 = k1;
00074     _b = b;
00075 
00076     _termWeight = queryTermWeight( 1000, 0 );
00077     _precomputeConstants();
00078   }
00079 
00080   double scoreOccurrence( int occurrences, int documentLength ) {
00081     double numerator = _idfTimesK1PlusOne * occurrences;
00082     double denominator = occurrences + _k1TimesOneMinusB + _bOverAvgDocLength * documentLength;
00083 
00084     return numerator / denominator;
00085   }
00086 
00087   double maximumScore( int maximumOccurrences, int minimumDocumentLength ) {
00088     return scoreOccurrence( maximumOccurrences, minimumDocumentLength );
00089   }
00090 
00091   double queryTermWeight( double queryK1, double queryB ) {
00092     return ( _inverseDocumentFrequency * queryK1 ) / ( 1 + queryK1 * ( (1-queryB) + queryB * (1/_averageDocumentLength) ) );
00093   }
00094 };
00095 
00096 #endif // LEMUR_OKAPITERMSCOREFUNCTION_HPP
00097 

Generated on Wed Nov 3 12:59:00 2004 for Lemur Toolkit by doxygen 1.2.18