//
// ContextSimpleCountCollectorCopier
//
// Copies a query tree, filling in counts for ContextCounterNodes whose
// subtrees are simple enough (plain index terms, optionally restricted
// to a single field) directly from index statistics or the list cache,
// so those subtrees do not have to be evaluated at query time.
//
#ifndef INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP
#define INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP

#include "indri/QuerySpec.hpp"
#include "indri/Copier.hpp"
#include "indri/delete_range.hpp"
#include "indri/Repository.hpp"
#include "indri/IndriIndex.hpp"
#include "indri/ListCache.hpp"

#include <cassert>
#include <string>
#include <vector>

class ContextSimpleCountCollectorCopier : public indri::lang::Copier {
private:
  std::vector<indri::lang::Node*> _newNodes;
  Repository& _repository;
  IndriIndex& _index;
  ListCache& _cache;

  // Walks a ContextCounterNode subtree and decides whether its counts can
  // be computed directly from index statistics: the subtree may contain
  // plain index terms and at most one field restriction, but no other
  // node types.
  class SubtreeWalker : public indri::lang::Walker {
  private:
    bool _computable;
    int _extentOrs;
    bool _hasContext;

    std::vector<indri::lang::IndexTerm*> _terms;
    indri::lang::Field* _field;

  public:
    SubtreeWalker() :
      _computable(true),
      _extentOrs(0),
      _hasContext(false),
      _field(0)
    {
    }

    bool isComputable() {
      return _computable && _terms.size();
    }

    std::vector<indri::lang::IndexTerm*>& getTerms() {
      return _terms;
    }

    indri::lang::Field* getField() {
      return _field;
    }

    bool hasContext() const {
      return _hasContext;
    }

    void defaultBefore( indri::lang::Node* node ) {
      // any node type that isn't handled explicitly below makes the
      // subtree too complex to evaluate from simple counts
      _computable = false;
    }

    void before( indri::lang::ContextCounterNode* contextNode ) {
      // remember whether this counter node has a context restriction
      _hasContext = contextNode->getContext() ? true : false;
    }

    void before( indri::lang::ExtentOr* extentOrNode ) {
      _extentOrs++;
    }

    void after( indri::lang::ExtentOr* extentOrNode ) {
      _extentOrs--;
    }

    void before( indri::lang::ExtentAnd* extentAndNode ) {
      // an ExtentAnd with more than one child cannot be reduced to
      // simple term counts
      if( extentAndNode->getChildren().size() > 1 )
        _computable = false;
    }

    void before( indri::lang::Field* fieldNode ) {
      if( _extentOrs || _field ) {
        // a field inside an ExtentOr, or more than one field,
        // is too complex to handle here
        _computable = false;
      }

      _field = fieldNode;
    }

    void before( indri::lang::IndexTerm* termNode ) {
      _terms.push_back(termNode);
    }

    void before( indri::lang::ExtentInside* insideNode ) {
      // ExtentInside is acceptable; its term and field children are
      // handled by the methods above
    }
  };

  void _computeRestrictedCounts( indri::lang::ContextCounterNode* contextNode, SubtreeWalker& subtree ) {
    UINT64 occurrences = 0;
    UINT64 size;

    assert( subtree.isComputable() );
    assert( subtree.getField() );

    // the subtree contains a single field restriction, so count term
    // occurrences within that field
    int fieldID = _index.field( subtree.getField()->getFieldName().c_str() );

    if( subtree.hasContext() ) {
      size = _index.fieldTermCount( fieldID );
    } else {
      size = _index.termCount();
    }

    std::vector<indri::lang::IndexTerm*>& terms = subtree.getTerms();
    for( unsigned int i=0; i<terms.size(); i++ ) {
      std::string processed = terms[i]->getText();
      if( terms[i]->getStemmed() == false )
        processed = _repository.processTerm( terms[i]->getText() );

      if( processed.length() != 0 ) {
        int termID = _index.term( processed.c_str() );
        occurrences += _index.fieldTermCount( fieldID, termID );
      }
    }

    contextNode->setCounts( occurrences, size );
  }

  void _computeUnrestrictedCounts( indri::lang::ContextCounterNode* contextNode, SubtreeWalker& subtree ) {
    assert( subtree.isComputable() );
    assert( !subtree.getField() );

    UINT64 occurrences = 0;
    UINT64 size = 0;

    UINT64 maxOccurrences = 0;
    UINT64 maxContextSize = 0;
    UINT64 minContextSize = MAX_INT64;
    double maxDocumentFraction = 0;

    // no field restriction: sum collection-wide term counts and gather
    // the per-document bounds
    std::vector<indri::lang::IndexTerm*>& terms = subtree.getTerms();
    for( unsigned int i=0; i<terms.size(); i++ ) {
      std::string processed = terms[i]->getText();
      if( terms[i]->getStemmed() == false )
        processed = _repository.processTerm( terms[i]->getText() );

      if( processed.length() != 0 ) {
        int termID = _index.term( processed.c_str() );
        occurrences += _index.termCount( termID );

        maxOccurrences += _index.termMaxDocumentFrequency( termID );
        minContextSize = lemur_compat::min<UINT64>( _index.termMinDocumentLength( termID ), minContextSize );

        maxDocumentFraction += _index.termMaxDocumentFraction( termID );
        maxDocumentFraction = lemur_compat::min<double>( 1.0, maxDocumentFraction );
      }
    }

    maxContextSize = _index.maxDocumentLength();

    size = _index.termCount();
    contextNode->setCounts( occurrences, size, maxOccurrences, minContextSize, maxContextSize, maxDocumentFraction );
  }

  void _computeCounts( indri::lang::ContextCounterNode* contextNode, SubtreeWalker& subtree ) {
    assert( subtree.isComputable() );

    if( subtree.getField() ) {
      _computeRestrictedCounts( contextNode, subtree );
    } else {
      _computeUnrestrictedCounts( contextNode, subtree );
    }
  }

  void _setCountsFromList( indri::lang::ContextCounterNode* contextNode, ListCache::CachedList* list ) {
    // the counts were found in the list cache; copy them in and clear the
    // extent and context so the subtree is not evaluated again
    contextNode->setCounts( list->occurrences,
                            list->contextSize,
                            list->maximumOccurrences,
                            list->minimumContextSize,
                            list->maximumContextSize,
                            list->maximumContextFraction );
    contextNode->setRawExtent( 0 );
    contextNode->setContext( 0 );
  }

public:
  ContextSimpleCountCollectorCopier( Repository& repository, ListCache& cache ) :
    _repository(repository),
    _index(*repository.index()),
    _cache(cache)
  {
  }

  ~ContextSimpleCountCollectorCopier() {
    delete_vector_contents<indri::lang::Node*>( _newNodes );
  }

  indri::lang::Node* defaultAfter( indri::lang::Node* oldNode, indri::lang::Node* newNode ) {
    _newNodes.push_back( newNode );
    return newNode;
  }

  indri::lang::Node* after( indri::lang::ContextCounterNode* contextNode, indri::lang::ContextCounterNode* newNode ) {
    ListCache::CachedList* list = _cache.find( newNode->getRawExtent(), newNode->getContext() );

    if( list ) {
      _setCountsFromList( newNode, list );
    } else {
      // no cached statistics; see if the counts can be computed
      // directly from the index
      SubtreeWalker subtree;
      contextNode->walk(subtree);

      if( subtree.isComputable() ) {
        // the counts are known now, so the copied node no longer needs
        // its extent and context subtrees
        _computeCounts( newNode, subtree );
        newNode->setRawExtent( 0 );
        newNode->setContext( 0 );
      } else if( newNode->getContext() == 0 ) {
        // the subtree must be evaluated at query time, but with no
        // context the context size is just the collection term count
        newNode->setContextSize( _index.termCount() );
      }
    }

    // keep ownership of the copied node so it is deleted along with the
    // rest of the copied tree
    _newNodes.push_back( newNode );
    return newNode;
  }
};
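
// A minimal usage sketch (not part of the original file). The repository
// setup and query-tree construction below are assumptions about the caller's
// code; the exact open call and the way the query tree is obtained depend on
// the surrounding application.
//
//   Repository repository;
//   repository.openRead( "/path/to/index" );   // hypothetical index path
//   ListCache listCache;
//
//   ContextSimpleCountCollectorCopier copier( repository, listCache );
//   indri::lang::Node* rewritten = queryRoot->copy( copier );  // queryRoot: a parsed query tree
//
//   // 'rewritten' now has ContextCounterNodes with precomputed counts where
//   // possible; the copier owns the copied nodes and deletes them when it
//   // goes out of scope.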

#endif // INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP