Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

FieldListIterator.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // FieldListIterator
00015 //
00016 // 10 January 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_FIELDLISTITERATOR_HPP
00020 #define INDRI_FIELDLISTITERATOR_HPP
00021 
00022 #include "indri/Extent.hpp"
00023 #include "indri/FieldListDiskBlockReader.hpp"
00024 #include "File.hpp"
00025 #include "ReadBuffer.hpp"
00026 
00027 #define INDRI_FIELDLISTITERATOR_BUFFERSIZE (1024*1024)
00028 
00029 namespace indri {
00030   namespace index {
00031 
00032     struct FieldExtentInfo {
00033       int documentID;
00034       greedy_vector<Extent> extents;
00035       greedy_vector<UINT64> numbers;
00036     };
00037 
00038     class FieldListIterator {
00039     private:
00040       FieldListDiskBlockReader _reader;
00041       FieldExtentInfo _info;
00042       bool _finished;
00043       bool _hasMore;
00044       bool _numeric;
00045 
00046       File& _file;
00047       ReadBuffer _readBuffer;
00048 
00049       bool _fetchNextBlock() {
00050         const char* nextBlock = _readBuffer.read( INDRI_FIELDLIST_BLOCKSIZE );
00051 
00052         if( !nextBlock )
00053           return false;
00054 
00055         _reader.setBlock( nextBlock );
00056         _reader.next();
00057         return true;
00058       }
00059 
00060       void _fetchDocument( int documentID, bool& endBlock ) {
00061         if( _numeric ) {
00062           _fetchDocumentNumeric( documentID, endBlock );
00063         } else {
00064           _fetchDocumentNonNumeric( documentID, endBlock );
00065         }
00066       }
00067 
00068       void _fetchDocumentNonNumeric( int documentID, bool& endBlock ) {
00069         endBlock = false;
00070 
00071         if( documentID != _reader.document() )
00072           return;
00073 
00074         _info.extents.push_back( _reader.extent() );
00075 
00076         while( _reader.next() ) {
00077           if( _reader.document() != documentID )
00078             return;
00079 
00080           _info.extents.push_back( _reader.extent() );
00081         }
00082 
00083         endBlock = true;
00084       }
00085 
00086       void _fetchDocumentNumeric( int documentID, bool& endBlock ) {
00087         endBlock = false;
00088 
00089         if( documentID != _reader.document() )
00090           return;
00091 
00092         _info.extents.push_back( _reader.extent() );
00093         _info.numbers.push_back( _reader.number() );
00094 
00095         while( _reader.next() ) {
00096           if( _reader.document() != documentID )
00097             return;
00098 
00099           _info.extents.push_back( _reader.extent() );
00100           _info.numbers.push_back( _reader.number() );
00101         }
00102 
00103         endBlock = true;
00104       }
00105 
00106     public:
00107       FieldListIterator( File& fieldListFile, bool numeric ) :
00108         _file( fieldListFile ),
00109         _readBuffer( fieldListFile, INDRI_FIELDLISTITERATOR_BUFFERSIZE, false ),
00110         _numeric(numeric)
00111       {
00112       }
00113 
00114       void startIteration() {
00115         _hasMore = true;
00116         _readBuffer.seekg( 0, std::fstream::beg );
00117         _info.documentID = -1;
00118         _finished = !_fetchNextBlock();
00119       }
00120 
00121       bool hasMore() {
00122         return _hasMore;
00123       }
00124 
00125       const FieldExtentInfo* currentEntry() {
00126         if( !_finished )
00127           return &_info;
00128         
00129         return 0;
00130       }
00131 
00132       const FieldExtentInfo* nextEntry() {
00133         if( _finished )
00134           return 0;
00135 
00136         if( !_hasMore ) {
00137           _finished = true;
00138           return 0;
00139         }
00140 
00141         // reader is pointing to next document ID already
00142         int documentID = _reader.document();
00143         _info.documentID = documentID;
00144         _info.extents.clear();
00145         _info.numbers.clear();
00146         bool atEndOfBlock;
00147 
00148         _fetchDocument( documentID, atEndOfBlock );
00149 
00150         while( atEndOfBlock ) {
00151           if( ! _fetchNextBlock() ) {
00152             _hasMore = false;
00153             return &_info;
00154           }
00155 
00156           _fetchDocument( documentID, atEndOfBlock );
00157         }
00158 
00159         return &_info;
00160       }
00161 
00162       const FieldExtentInfo* nextEntry( int documentID ) {
00163         if( _finished )
00164           return 0;
00165 
00166         if( documentID <= _info.documentID )
00167           return &_info;
00168 
00169         if( !_hasMore ) {
00170           _finished = true;
00171           return 0;
00172         }
00173 
00174         // we can skip blocks that only have documents that
00175         // we aren't interested in
00176         while( _reader.lastDocument() < documentID ) {
00177           if( ! _fetchNextBlock() ) {
00178             _finished = true;
00179             _hasMore = false;
00180             return 0;
00181           }
00182         }
00183 
00184         const FieldExtentInfo* info;
00185 
00186         // read each entry until we find one that matches, or until we're done
00187         for( info = nextEntry(); info && info->documentID < documentID; info = nextEntry() )
00188           ;
00189 
00190         return info;
00191       }
00192     };
00193   }
00194 }
00195 
00196 #endif // INDRI_FIELDLISTITERATOR_HPP

Generated on Wed Nov 3 12:58:55 2004 for Lemur Toolkit by doxygen1.2.18