00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef LEMUR_OKAPITERMSCOREFUNCTION_HPP
00020 #define LEMUR_OKAPITERMSCOREFUNCTION_HPP
00021
00022 #include "indri/TermScoreFunction.hpp"
00023 #include "indri/IndriIndex.hpp"
00024 #include <math.h>
00025
00026 class OkapiTermScoreFunction : public TermScoreFunction {
00027 private:
00029 double _inverseDocumentFrequency;
00031 double _averageDocumentLength;
00032
00033 double _termWeight;
00034
00035
00036 double _k1;
00037 double _b;
00038
00039
00040 double _bOverAvgDocLength;
00041 double _k1TimesOneMinusB;
00042 double _idfTimesK1PlusOne;
00043 double _k1TimesBOverAvgDocLength;
00044 double _termWeightTimesIDFTimesK1;
00045
00046 void _precomputeConstants() {
00047 _idfTimesK1PlusOne = _inverseDocumentFrequency * ( _k1 + 1 );
00048 _k1TimesOneMinusB = _k1 * (1-_b);
00049 _bOverAvgDocLength = _b / _averageDocumentLength;
00050 _k1TimesBOverAvgDocLength = _k1 * _bOverAvgDocLength;
00051 _termWeightTimesIDFTimesK1 = _termWeight * _inverseDocumentFrequency * _k1;
00052 }
00053
00054 public:
00055 OkapiTermScoreFunction( IndriIndex& index, int termID, double termWeight, double k1 = 1, double b = 0.5 ) {
00056 double idfNumerator = index.docCount()+1;
00057 double idfDenominator = 0.5+index.docCount( termID );
00058
00059 _inverseDocumentFrequency = log( idfNumerator / idfDenominator );
00060 _averageDocumentLength = index.docLengthAvg();
00061 _termWeight = termWeight;
00062
00063 _k1 = k1;
00064 _b = b;
00065
00066 _precomputeConstants();
00067 }
00068
00069 OkapiTermScoreFunction( double idf, double averageDocumentLength, double k1, double b ) {
00070 _inverseDocumentFrequency = idf;
00071 _averageDocumentLength = averageDocumentLength;
00072
00073 _k1 = k1;
00074 _b = b;
00075
00076 _termWeight = queryTermWeight( 1000, 0 );
00077 _precomputeConstants();
00078 }
00079
00080 double scoreOccurrence( int occurrences, int documentLength ) {
00081 double numerator = _idfTimesK1PlusOne * occurrences;
00082 double denominator = occurrences + _k1TimesOneMinusB + _bOverAvgDocLength * documentLength;
00083
00084 return numerator / denominator;
00085 }
00086
00087 double maximumScore( int maximumOccurrences, int minimumDocumentLength ) {
00088 return scoreOccurrence( maximumOccurrences, minimumDocumentLength );
00089 }
00090
00091 double queryTermWeight( double queryK1, double queryB ) {
00092 return ( _inverseDocumentFrequency * queryK1 ) / ( 1 + queryK1 * ( (1-queryB) + queryB * (1/_averageDocumentLength) ) );
00093 }
00094 };
00095
00096 #endif // LEMUR_OKAPITERMSCOREFUNCTION_HPP
00097