00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_DOCLISTDISKBLOCKWRITER_HPP
00020 #define INDRI_DOCLISTDISKBLOCKWRITER_HPP
00021
00022 #include "indri/DocListDiskBlockReader.hpp"
00023 #include <indri/greedy_vector>
00024
00025
00026 #define INDRI_MAX_SINGLE_DOC_POSITION_SIZE (4+2+5*3)
00027 #define INDRI_MAX_POSITION_SIZE (5)
00028
00029 namespace indri {
00030 namespace index {
00031 class DocListDiskBlockWriter {
00032 char _block[ INDRI_DOCLIST_BLOCKSIZE ];
00033 char* _data;
00034 char* _positionsSpot;
00035 int _positions;
00036 greedy_vector<UINT32> _termIDs;
00037 greedy_vector<UINT16> _endings;
00038
00039 int _lastDocumentID;
00040 int _lastTermID;
00041 int _lastPosition;
00042
00043 public:
00044 DocListDiskBlockWriter() {
00045 clear();
00046 }
00047
00048 private:
00049 char* _beginMetadata() {
00050 size_t terms = _termIDs.size();
00051 size_t currentTermSize = terms * sizeof(UINT32);
00052 size_t currentEndingsSize = terms * sizeof(UINT16);
00053 size_t lastDocumentIDSize = sizeof(UINT32);
00054
00055 return _block + INDRI_DOCLIST_BLOCKSIZE - (currentTermSize + currentEndingsSize + lastDocumentIDSize);
00056 }
00057
00058 void _completeTerm() {
00059
00060
00061 _addPositionsCount();
00062 _endings.push_back( (UINT16) (_data - _block) );
00063 }
00064
00065 void _addPositionsCount() {
00066 assert( _positionsSpot );
00067 int compressedSize = RVLCompress::compressedSize( _positions );
00068
00069 if( compressedSize > 1 ) {
00070 ::memmove( _positionsSpot + compressedSize, _positionsSpot + 1, _data - _positionsSpot - 1 );
00071 _data += compressedSize - 1;
00072 }
00073
00074 RVLCompress::compress_int( _positionsSpot, _positions );
00075 _positions = 0;
00076 _positionsSpot = 0;
00077 }
00078
00079 void _addTerm( int termID ) {
00080 if( _termIDs.size() ) {
00081 _completeTerm();
00082 }
00083
00084 _termIDs.push_back( termID );
00085 _lastTermID = termID;
00086 _lastDocumentID = 0;
00087 _lastPosition = 0;
00088 }
00089
00090 void _addDocument( int documentID ) {
00091 if( _positions ) {
00092 _addPositionsCount();
00093 }
00094
00095 _data = RVLCompress::compress_int( _data, documentID - _lastDocumentID );
00096 _lastDocumentID = documentID;
00097 _lastPosition = 0;
00098
00099
00100 _positionsSpot = _data;
00101 _data += 1;
00102 }
00103
00104 void _addPosition( int position ) {
00105 _data = RVLCompress::compress_int( _data, position - _lastPosition );
00106 _lastPosition = position;
00107 _positions++;
00108 }
00109
00110 public:
00111 int addDocument( int termID, int documentID, const int* positions, int positionCount ) {
00112 int i;
00113
00114 if( _beginMetadata() - _data < INDRI_MAX_SINGLE_DOC_POSITION_SIZE )
00115 return 0;
00116
00117 if( termID != _lastTermID )
00118 _addTerm( termID );
00119
00120 _addDocument( documentID );
00121
00122 char* metadata = _beginMetadata();
00123
00124 for( i=0; i<positionCount && metadata - _data > INDRI_MAX_POSITION_SIZE; i++ ) {
00125 _addPosition( positions[i] );
00126 }
00127
00128 return i;
00129 }
00130
00131 bool addPosition( int termID, int documentID, int position ) {
00132 if( _beginMetadata() - _data < INDRI_MAX_SINGLE_DOC_POSITION_SIZE )
00133 return false;
00134
00135 if( termID != _lastTermID ) {
00136 _addTerm( termID );
00137 }
00138
00139 if( documentID != _lastDocumentID ) {
00140 _addDocument( documentID );
00141 }
00142
00143 _addPosition( position );
00144 return true;
00145 }
00146
00147 void close() {
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162 if( _positions ) {
00163 _completeTerm();
00164 }
00165
00166 assert( _endings.size() == _termIDs.size() );
00167
00168
00169 UINT16 termCount = UINT16(_termIDs.size());
00170 memcpy( _block, &termCount, sizeof(termCount) );
00171
00172 char* endBlock = _block + sizeof(_block);
00173 char* startTermIDs = endBlock - sizeof(UINT32)*termCount;
00174 char* startEndings = startTermIDs - sizeof(UINT16)*termCount;
00175 char* startFinalDocID = startEndings - sizeof(UINT32);
00176
00177 assert( startFinalDocID >= _data );
00178
00179
00180 memcpy( startTermIDs, &_termIDs.front(), termCount*sizeof(UINT32) );
00181
00182
00183 memcpy( startEndings, &_endings.front(), termCount*sizeof(UINT16) );
00184
00185
00186 memcpy( startFinalDocID, &_lastDocumentID, sizeof(UINT32) );
00187 }
00188
00189 void clear() {
00190 _termIDs.clear();
00191 _endings.clear();
00192 _data = _block + sizeof(UINT16);
00193
00194 _lastDocumentID = 0;
00195 _lastTermID = 0;
00196 _lastPosition = 0;
00197 _positions = 0;
00198 _positionsSpot = 0;
00199 }
00200
00201 char* data() {
00202 return _block;
00203 }
00204
00205 int dataSize() const {
00206 return INDRI_DOCLIST_BLOCKSIZE;
00207 }
00208 };
00209 }
00210 }
00211
00212 #endif // INDRI_DOCLISTDISKBLOCKWRITER_HPP
00213