DocListDiskBlockWriter.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // DocListDiskBlockWriter
00015 // 
00016 // 9 January 2004 - tds
00017 //
00018 
00019 #ifndef INDRI_DOCLISTDISKBLOCKWRITER_HPP
00020 #define INDRI_DOCLISTDISKBLOCKWRITER_HPP
00021 
00022 #include "indri/DocListDiskBlockReader.hpp"
00023 #include <indri/greedy_vector>
00024 
00025 // keep enough room for a new termID, new ending, new docID, new location count, new position
00026 #define INDRI_MAX_SINGLE_DOC_POSITION_SIZE (4+2+5*3)
00027 #define INDRI_MAX_POSITION_SIZE (5)
00028 
00029 namespace indri {
00030   namespace index {
00031     class DocListDiskBlockWriter {
00032       char _block[ INDRI_DOCLIST_BLOCKSIZE ];
00033       char* _data;
00034       char* _positionsSpot;
00035       int _positions;
00036       greedy_vector<UINT32> _termIDs;
00037       greedy_vector<UINT16> _endings;
00038 
00039       int _lastDocumentID;
00040       int _lastTermID;
00041       int _lastPosition;
00042     
00043     public:
00044       DocListDiskBlockWriter() {
00045         clear();
00046       }
00047 
00048     private:
00049       char* _beginMetadata() {
00050         size_t terms = _termIDs.size();
00051         size_t currentTermSize = terms * sizeof(UINT32);
00052         size_t currentEndingsSize = terms * sizeof(UINT16);
00053         size_t lastDocumentIDSize = sizeof(UINT32);
00054 
00055         return _block + INDRI_DOCLIST_BLOCKSIZE - (currentTermSize + currentEndingsSize + lastDocumentIDSize);
00056       }
00057 
00058       void _completeTerm() {
00059         // _addPositionsCount has to come before endings, because
00060         // _addPositionsCount may change the value of _data
00061         _addPositionsCount();
00062         _endings.push_back( (UINT16) (_data - _block) );
00063       }
00064 
00065       void _addPositionsCount() {
00066         assert( _positionsSpot );
00067         int compressedSize = RVLCompress::compressedSize( _positions );
00068 
00069         if( compressedSize > 1 ) {
00070           ::memmove( _positionsSpot + compressedSize, _positionsSpot + 1, _data - _positionsSpot - 1 );
00071           _data += compressedSize - 1;
00072         }
00073 
00074         RVLCompress::compress_int( _positionsSpot, _positions );
00075         _positions = 0;
00076         _positionsSpot = 0;
00077       }
00078 
00079       void _addTerm( int termID ) {
00080         if( _termIDs.size() ) {
00081           _completeTerm();
00082         }
00083 
00084         _termIDs.push_back( termID );
00085         _lastTermID = termID;
00086         _lastDocumentID = 0;
00087         _lastPosition = 0;
00088       }
00089 
00090       void _addDocument( int documentID ) {
00091         if( _positions ) {
00092           _addPositionsCount();
00093         }
00094 
00095         _data = RVLCompress::compress_int( _data, documentID - _lastDocumentID );
00096         _lastDocumentID = documentID;
00097         _lastPosition = 0;
00098 
00099         // leave one byte around for positions
00100         _positionsSpot = _data;
00101         _data += 1;
00102       }
00103 
00104       void _addPosition( int position ) {
00105         _data = RVLCompress::compress_int( _data, position - _lastPosition );
00106         _lastPosition = position;
00107         _positions++;
00108       }
00109 
00110     public:
00111       int addDocument( int termID, int documentID, const int* positions, int positionCount ) {
00112         int i;
00113 
00114         if( _beginMetadata() - _data < INDRI_MAX_SINGLE_DOC_POSITION_SIZE )
00115           return 0;
00116         
00117         if( termID != _lastTermID )
00118           _addTerm( termID );
00119       
00120         _addDocument( documentID );
00121 
00122         char* metadata = _beginMetadata();
00123 
00124         for( i=0; i<positionCount && metadata - _data > INDRI_MAX_POSITION_SIZE; i++ ) {
00125           _addPosition( positions[i] );
00126         }
00127         
00128         return i;
00129       }
00130 
00131       bool addPosition( int termID, int documentID, int position ) {
00132         if( _beginMetadata() - _data < INDRI_MAX_SINGLE_DOC_POSITION_SIZE )
00133           return false;
00134             
00135         if( termID != _lastTermID ) {
00136           _addTerm( termID );
00137         }
00138 
00139         if( documentID != _lastDocumentID ) {
00140           _addDocument( documentID );
00141         }
00142 
00143         _addPosition( position );
00144         return true;
00145       }
00146 
00147       void close() {
00148         //
00149         // This method adds all the metadata to the block and prepares
00150         // it for writing to disk.  The block format is as follows:
00151         //
00152         // Header:  2-byte termCount
00153         // Data:    Variable length section consisting of documentIDs and 
00154         //          locations, compressed
00155         // Void:    possibly some unused data space here
00156         // Footer:  4-byte last documentID
00157         //          (2 byte * termCount) endings -- the offsets into the
00158         //              data portion where the termID changes
00159         //          (4 byte * termCount) termIDs -- termIDs used in this
00160         //              block
00161 
00162         if( _positions ) {
00163           _completeTerm();
00164         }
00165 
00166         assert( _endings.size() == _termIDs.size() );
00167 
00168         // put a term count at the beginning of the block
00169         UINT16 termCount = UINT16(_termIDs.size());
00170         memcpy( _block, &termCount, sizeof(termCount) );
00171 
00172         char* endBlock = _block + sizeof(_block);
00173         char* startTermIDs = endBlock - sizeof(UINT32)*termCount;
00174         char* startEndings = startTermIDs - sizeof(UINT16)*termCount;
00175         char* startFinalDocID = startEndings - sizeof(UINT32);
00176 
00177         assert( startFinalDocID >= _data ); // term list data and end of block data shouldn't overlap
00178 
00179         // store the termID list at the end of the block
00180         memcpy( startTermIDs, &_termIDs.front(), termCount*sizeof(UINT32) );
00181 
00182         // store endings
00183         memcpy( startEndings, &_endings.front(), termCount*sizeof(UINT16) );
00184 
00185         // store final document ID
00186         memcpy( startFinalDocID, &_lastDocumentID, sizeof(UINT32) );
00187       }
00188 
00189       void clear() {
00190         _termIDs.clear();
00191         _endings.clear();
00192         _data = _block + sizeof(UINT16);
00193 
00194         _lastDocumentID = 0;
00195         _lastTermID = 0;
00196         _lastPosition = 0;
00197         _positions = 0;
00198         _positionsSpot = 0;
00199       }
00200 
00201       char* data() {
00202         return _block;
00203       }
00204 
00205       int dataSize() const {
00206         return INDRI_DOCLIST_BLOCKSIZE;
00207       }
00208     };
00209   }
00210 }
00211 
00212 #endif // INDRI_DOCLISTDISKBLOCKWRITER_HPP
00213