00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef LEMUR_KEYFILEDOCLISTMEMORYBUILDER_HPP
00020 #define LEMUR_KEYFILEDOCLISTMEMORYBUILDER_HPP
00021
00022 #include "RVLCompress.hpp"
00023 #include <vector>
00024 #include <assert.h>
00025 #include "indri/greedy_vector"
00026
00027 namespace indri {
00028 namespace index {
00029 class DocListMemoryBuilderIterator {
00030 const greedy_vector< std::pair<char*, char*>, 4 >& _lists;
00031 greedy_vector< std::pair<char*, char*>, 4 >::const_iterator _current;
00032
00033 const char* _list;
00034 const char* _listEnd;
00035
00036 int _currentDocument;
00037 int _currentPosition;
00038 int _positionsLeft;
00039
00040 public:
00041 DocListMemoryBuilderIterator( const greedy_vector< std::pair<char*,char*>, 4 >& lists ) :
00042 _lists(lists)
00043 {
00044 _current = _lists.begin();
00045 _currentDocument = 0;
00046 _currentPosition = 0;
00047 _positionsLeft = 0;
00048 _list = 0;
00049 _listEnd = 0;
00050
00051 if( _current != _lists.end() ) {
00052 _list = _current->first;
00053 _listEnd = _current->second;
00054 }
00055 }
00056
00057 bool next() {
00058 if( _list < _listEnd ) {
00059 if( _positionsLeft > 0 ) {
00060
00061 int deltaPosition;
00062 _list = RVLCompress::decompress_int( _list, deltaPosition );
00063 _currentPosition += deltaPosition;
00064 _positionsLeft--;
00065 return true;
00066 } else {
00067
00068 int deltaDocument;
00069 _list = RVLCompress::decompress_int( _list, deltaDocument );
00070 _list = RVLCompress::decompress_int( _list, _positionsLeft );
00071 _list = RVLCompress::decompress_int( _list, _currentPosition );
00072 _currentDocument += deltaDocument;
00073 _positionsLeft--;
00074 return true;
00075 }
00076 } else {
00077 assert( _list == _listEnd );
00078
00079
00080 if( _current != _lists.end() )
00081 _current++;
00082
00083 if( _current != _lists.end() ) {
00084 _list = _current->first;
00085 _listEnd = _current->second;
00086 return next();
00087 }
00088
00089
00090 return false;
00091 }
00092 }
00093
00094 int document() {
00095 return _currentDocument;
00096 }
00097
00098 int position() {
00099 return _currentPosition;
00100 }
00101 };
00102
00103 class DocListMemoryBuilder {
00104 public:
00105 typedef DocListMemoryBuilderIterator iterator;
00106
00107 private:
00108 int _documentFrequency;
00109 int _termFrequency;
00110
00111 greedy_vector< std::pair<char*,char*>, 4 > _lists;
00112
00113 char* _list;
00114 char* _listBegin;
00115 char* _listEnd;
00116 char* _locationCountPointer;
00117
00118 int _lastLocation;
00119 int _lastDocument;
00120 int _lastTermFrequency;
00121
00122 void _storeCompressedInt( std::vector<char>& destination, int data, int previous = 0 );
00123 void _createDocument( int docID );
00124 void _writeLocation( int location );
00125 void _terminateDocument();
00126 void _terminateSegment();
00127 void _grow();
00128 void _copy( DocListMemoryBuilder& other );
00129
00130 public:
00131 DocListMemoryBuilder();
00132 const DocListMemoryBuilder& operator=( DocListMemoryBuilder& other );
00133 void addLocation( int docID, int location );
00134 void clear();
00135 void close();
00136 iterator getIterator();
00137 bool empty();
00138
00139 int documentFrequency() const;
00140 int termFrequency() const;
00141 size_t memorySize() const;
00142 int curDocID() const;
00143 };
00144 }
00145 }
00146
00147 #endif // LEMUR_KEYFILEDOCLISTMEMORYBUILDER_HPP
00148