00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #ifndef INDRI_COMBINER_HPP
00021 #define INDRI_COMBINER_HPP
00022
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "indri/Buffer.hpp"
#include "indri/HashTable.hpp"
00028
00029 class Combiner {
00030 private:
00031 std::vector< std::ofstream* > _docBucketFiles;
00032 std::vector< std::ofstream* > _linkBucketFiles;
00033 std::vector< std::stringstream* > _docBuckets;
00034 std::vector< std::stringstream* > _linkBuckets;
00035 int _bins;
00036
00037 struct strhash {
00038 public:
00039 int operator() ( const char* k ) const {
00040 int hash = 0;
00041 for( ; *k; k++ ){
00042 hash *= 7;
00043 hash += *k;
00044 }
00045 return hash;
00046 }
00047 };
00048
00049 struct strcompst {
00050 public:
00051 int operator () ( const char* o, const char* t ) const {
00052 return strcmp( o, t );
00053 }
00054 };
00055
00056 struct url_entry {
00057 char* url;
00058 char* corpusPath;
00059 char* docNo;
00060 int linkCount;
00061 Buffer linkinfo;
00062
00063 void addLink( const char* linkDocUrl,
00064 const char* linkText )
00065 {
00066 if( linkinfo.position() ) {
00067
00068 linkinfo.unwrite(1);
00069 }
00070
00071 int docUrlLen = strlen(linkDocUrl);
00072 int textLen = strlen(linkText);
00073
00074 int total = docUrlLen + sizeof "LINKFROM=" +
00075 textLen + sizeof "TEXT=" + 1;
00076
00077 sprintf( linkinfo.write(total),
00078 "LINKFROM=%s\nTEXT=%s\n",
00079 linkDocUrl,
00080 linkText );
00081
00082 linkCount++;
00083 }
00084 };
00085
00086 typedef HashTable<char*, url_entry*, strhash, strcompst> UrlEntryTable;
00087 typedef HashTable<char*, std::vector<url_entry*>, strhash, strcompst> UrlEntryVectorTable;
00088
00089 url_entry* _newUrlEntry( char* url, char* corpusPath, char* docNo );
00090 void _deleteUrlEntry( void* buffer );
00091
00092 void _openWriteBuckets( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, const std::string& path, int bins );
00093 void _flushWriteBuffer( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, bool force, int i );
00094 void _flushWriteBuffers( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, bool force );
00095 void _closeWriteBuckets( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets );
00096 void _openReadBuckets( std::vector<std::ifstream*>& buckets, const std::string& path, int bins );
00097 void _readDocBucket( UrlEntryTable& urlTable, std::ifstream& docIn );
00098
00099 int hashString( const char* str );
00100 void hashToBuckets( std::ifstream& in, const std::string& path );
00101 void createBuckets( const std::string& tmpPath );
00102 void closeBuckets();
00103 void combineBucket( const std::string& outputPath, const std::string& tmpPath, int bucket );
00104 void hashToBuckets( const std::string& inputPath );
00105 void combineRedirectDestinationBucket( const std::string& tmpPath, int i, std::vector<std::stringstream*>& outBuffers, std::vector<std::ofstream*>& outputFiles );
00106
00107 public:
00108 Combiner( int bins = 10 ) : _bins(bins) {}
00109
00110 void combineRedirectDestinationBuckets( const std::string& tmpPath );
00111 void combineBuckets( const std::string& outputPath, const std::string& tmpPath );
00112 void hashRedirectTargets( const std::string& bucketPath, const std::string& redirectsPath );
00113 void hashToBuckets( const std::string& bucketPath, const std::string& inputPath );
00114 void sortCorpusFiles( const std::string& outputPath, const std::string& preSortPath, const std::string& inputPath );
00115 };
00116
00117 #endif // INDRI_COMBINER_HPP
00118