ContextSimpleCountCollectorCopier.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // ContextSimpleCountCollectorCopier
00015 //
00016 // 5 March 2004 -- tds
00017 //
00018 // This copier uses a IndriIndex to extract context
00019 // counts for certain simple subgraphs.  It can compute
00020 // counts for the following types of expressions:
00021 //
00022 //   dog
00023 //   <dog cat>
00024 //   dog.title
00025 //   <dog cat>.title
00026 //   dog.(title)
00027 //   <dog cat>.(title)
00028 //   
00029 // Notably, it is unable to compute counts when more than
00030 // one field is involved.
00031 //
00032 
00033 #ifndef INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP
00034 #define INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP
00035 
00036 #include "indri/QuerySpec.hpp"
00037 #include "indri/Copier.hpp"
00038 #include "indri/delete_range.hpp"
00039 #include "indri/Repository.hpp"
00040 #include "indri/IndriIndex.hpp"
00041 #include "indri/ListCache.hpp"
00042 
00043 class ContextSimpleCountCollectorCopier : public indri::lang::Copier {
00044 private:
00045   std::vector<indri::lang::Node*> _newNodes;
00046   Repository& _repository;
00047   IndriIndex& _index;
00048   ListCache& _cache;
00049 
00050   class SubtreeWalker : public indri::lang::Walker {
00051   private:
00052     bool _computable;
00053     int _extentOrs;
00054     bool _hasContext;
00055 
00056     std::vector<indri::lang::IndexTerm*> _terms;
00057     indri::lang::Field* _field;
00058 
00059   public:
00060     SubtreeWalker() :
00061       _computable(true),
00062       _extentOrs(0),
00063       _field(0)
00064     {
00065     }
00066 
00067     bool isComputable() {
00068       return _computable && _terms.size();
00069     }
00070 
00071     std::vector<indri::lang::IndexTerm*>& getTerms() {
00072       return _terms;
00073     }
00074 
00075     indri::lang::Field* getField() {
00076       return _field;
00077     }
00078 
00079     bool hasContext() const {
00080       return _hasContext;
00081     }
00082 
00083     void defaultBefore( indri::lang::Node* node ) {
00084       // this means that we're seeing some node type that
00085       // we aren't otherwise trapping--that means this subtree
00086       // is surely not precomputable
00087       _computable = false;
00088     }
00089 
00090     void before( indri::lang::ContextCounterNode* contextNode ) {
00091       // if the context node has a context, then it must have a field in the context
00092       // if we find more than one field, we say this isn't computable.  Therefore, if
00093       // this subtree is computable and it has a context, the single field must be in the context.
00094       _hasContext = contextNode->getContext() ? true : false;
00095     }
00096     
00097     void before( indri::lang::ExtentOr* extentOrNode ) {
00098       _extentOrs++;
00099     }
00100 
00101     void after( indri::lang::ExtentOr* extentOrNode ) {
00102       _extentOrs--;
00103     }
00104 
00105     void before( indri::lang::ExtentAnd* extentAndNode ) {
00106       // we definitely can't deal with any "true" extentAnds
00107       // however, if this is just an and wrapper around a single
00108       // field, we won't let it fool us
00109       if( extentAndNode->getChildren().size() > 1 )
00110         _computable = false;
00111     }
00112 
00113     void before( indri::lang::Field* fieldNode ) {
00114       if( _extentOrs || _field ) {
00115         // fields can't be or-ed together; only terms can (_extentOr)
00116         // If we already saw a field, then this one proves that the tree isn't computable (_field)
00117         _computable = false;
00118       }
00119 
00120       _field = fieldNode;
00121     }
00122 
00123     void before( indri::lang::IndexTerm* termNode ) {
00124       _terms.push_back(termNode);
00125     }
00126 
00127     void before( indri::lang::ExtentInside* insideNode ) {
00128       // ignore this; the other checks should catch any bad trees
00129       // without having to worry about checking here
00130     }
00131   };
00132 
00133   void _computeRestrictedCounts( indri::lang::ContextCounterNode* contextNode, SubtreeWalker& subtree ) {
00134     UINT64 occurrences = 0;
00135     UINT64 size;
00136 
00137     assert( subtree.isComputable() );
00138     assert( subtree.getField() );
00139 
00140     // get the fieldID
00141     int fieldID = _index.field( subtree.getField()->getFieldName().c_str() );
00142 
00143     if( subtree.hasContext() ) {
00144       size = _index.fieldTermCount( fieldID );
00145     } else {
00146       size = _index.termCount();
00147     }
00148 
00149     std::vector<indri::lang::IndexTerm*>& terms = subtree.getTerms();
00150     for( unsigned int i=0; i<terms.size(); i++ ) {
00151       std::string processed = terms[i]->getText();
00152       if( terms[i]->getStemmed() == false )
00153         processed = _repository.processTerm( terms[i]->getText() );
00154 
00155       if( processed.length() != 0 ) {
00156       int termID = _index.term( processed.c_str() );
00157       occurrences += _index.fieldTermCount( fieldID, termID );
00158     }
00159     }
00160 
00161     contextNode->setCounts( occurrences, size );
00162   }
00163 
00164   void _computeUnrestrictedCounts( indri::lang::ContextCounterNode* contextNode, SubtreeWalker& subtree ) {
00165     assert( subtree.isComputable() );
00166     assert( !subtree.getField() );
00167 
00168     UINT64 occurrences = 0;
00169     UINT64 size = 0;
00170 
00171     UINT64 maxOccurrences = 0;
00172     UINT64 maxContextSize = 0;
00173     UINT64 minContextSize = MAX_INT64;
00174     double maxDocumentFraction = 0;
00175 
00176     std::vector<indri::lang::IndexTerm*>& terms = subtree.getTerms();
00177     for( unsigned int i=0; i<terms.size(); i++ ) {
00178       std::string processed = terms[i]->getText();
00179       if( terms[i]->getStemmed() == false )
00180         processed = _repository.processTerm( terms[i]->getText() );
00181 
00182       if( processed.length() != 0 ) {
00183       int termID = _index.term( processed.c_str() );
00184       occurrences += _index.termCount( termID );
00185 
00186       maxOccurrences += _index.termMaxDocumentFrequency( termID );
00187       minContextSize = lemur_compat::min<UINT64>( _index.termMinDocumentLength( termID ), minContextSize );
00188 
00189       maxDocumentFraction += _index.termMaxDocumentFraction( termID );
00190       maxDocumentFraction = lemur_compat::min<double>( 1.0, maxDocumentFraction );
00191       }
00192     }
00193 
00194     maxContextSize = _index.maxDocumentLength();
00195 
00196     size = _index.termCount();
00197     contextNode->setCounts( occurrences, size, maxOccurrences, minContextSize, maxContextSize, maxDocumentFraction );
00198   }
00199 
00200   void _computeCounts( indri::lang::ContextCounterNode* contextNode, SubtreeWalker& subtree ) {
00201     assert( subtree.isComputable() );
00202 
00203     if( subtree.getField() ) {
00204       _computeRestrictedCounts( contextNode, subtree );
00205     } else {
00206       _computeUnrestrictedCounts( contextNode, subtree );
00207     }
00208   }
00209 
00210   void _setCountsFromList( indri::lang::ContextCounterNode* contextNode, ListCache::CachedList* list ) {
00211     contextNode->setCounts( list->occurrences,
00212                             list->contextSize, 
00213                             list->maximumOccurrences,
00214                             list->minimumContextSize, 
00215                             list->maximumContextSize,
00216                             list->maximumContextFraction );
00217     contextNode->setRawExtent( 0 );
00218     contextNode->setContext( 0 );
00219   }
00220 
00221 public:
00222   ContextSimpleCountCollectorCopier( Repository& repository, ListCache& cache ) :
00223     _repository(repository),
00224     _index(*repository.index()),
00225     _cache(cache)
00226   {
00227   }
00228 
00229   ~ContextSimpleCountCollectorCopier() {
00230     delete_vector_contents<indri::lang::Node*>( _newNodes );
00231   }
00232 
00233   indri::lang::Node* defaultAfter( indri::lang::Node* oldNode, indri::lang::Node* newNode ) {
00234     _newNodes.push_back( newNode );
00235     return newNode;
00236   }
00237 
00238   indri::lang::Node* after( indri::lang::ContextCounterNode* contextNode, indri::lang::ContextCounterNode* newNode ) {
00239     ListCache::CachedList* list = _cache.find( newNode->getRawExtent(), newNode->getContext() );
00240 
00241     if( list ) {
00242       _setCountsFromList( newNode, list );
00243     } else {
00244       // first, walk the subtree to find out if it's computable
00245       SubtreeWalker subtree;
00246       contextNode->walk(subtree);
00247     
00248       if( subtree.isComputable() ) {
00249         // compute it, then cut off the subtree
00250         _computeCounts( newNode, subtree );
00251         newNode->setRawExtent( 0 );
00252         newNode->setContext( 0 );
00253       } else if( newNode->getContext() == 0 ) {
00254         // we can't figure out how many term occurrences
00255         // there are in this thing, but at least we know
00256         // how big the context size is, so set it
00257         newNode->setContextSize( _index.termCount() );
00258       }
00259     }
00260 
00261     // if it wasn't computable, keep the subtree around so the
00262     // inference network code can run it and figure out the counts
00263     _newNodes.push_back( newNode );
00264     return newNode;
00265   }
00266 };
00267 
00268 #endif // INDRI_CONTEXTSIMPLECOUNTCOLLECTORCOPIER_HPP
00269