Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

Combiner.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // Combiner
00015 //
00016 // 3 June 2004 -- tds
00017 //
00018 
00019 
00020 #ifndef INDRI_COMBINER_HPP
00021 #define INDRI_COMBINER_HPP
00022 
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include "indri/Buffer.hpp"
#include "indri/HashTable.hpp"
00028 
00029 class Combiner {
00030 private:
00031   std::vector< std::ofstream* > _docBucketFiles;
00032   std::vector< std::ofstream* > _linkBucketFiles;
00033   std::vector< std::stringstream* > _docBuckets;
00034   std::vector< std::stringstream* > _linkBuckets;
00035   int _bins;
00036 
00037   struct strhash {
00038   public:
00039     int operator() ( const char* k ) const {
00040       int hash = 0;
00041       for( ; *k; k++ ){
00042         hash *= 7;
00043         hash += *k;
00044       }
00045       return hash;
00046     }
00047   };
00048 
00049   struct strcompst {
00050   public:
00051     int operator () ( const char* o, const char* t ) const {
00052       return strcmp( o, t );
00053     }
00054   };
00055 
00056   struct url_entry {
00057     char* url;
00058     char* corpusPath;
00059     char* docNo;
00060     int linkCount;
00061     Buffer linkinfo;
00062 
00063     void addLink( const char* linkDocUrl,
00064                   const char* linkText )
00065     {
00066       if( linkinfo.position() ) {
00067         // remove trailing 0
00068         linkinfo.unwrite(1);
00069       }
00070 
00071       int docUrlLen = strlen(linkDocUrl);
00072       int textLen = strlen(linkText);
00073 
00074       int total = docUrlLen + sizeof "LINKFROM=" +
00075                   textLen + sizeof "TEXT=" + 1;
00076 
00077       sprintf( linkinfo.write(total),
00078                "LINKFROM=%s\nTEXT=%s\n",
00079                linkDocUrl,
00080                linkText );
00081 
00082       linkCount++;
00083     }
00084   };
00085 
00086   typedef HashTable<char*, url_entry*, strhash, strcompst> UrlEntryTable;
00087   typedef HashTable<char*, std::vector<url_entry*>, strhash, strcompst> UrlEntryVectorTable;
00088 
00089   url_entry* _newUrlEntry( char* url, char* corpusPath, char* docNo );
00090   void _deleteUrlEntry( void* buffer );
00091   
00092   void _openWriteBuckets( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, const std::string& path, int bins );
00093   void _flushWriteBuffer( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, bool force, int i );
00094   void _flushWriteBuffers( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets, bool force );
00095   void _closeWriteBuckets( std::vector<std::stringstream*>& buffers, std::vector<std::ofstream*>& buckets );
00096   void _openReadBuckets( std::vector<std::ifstream*>& buckets, const std::string& path, int bins );
00097   void _readDocBucket( UrlEntryTable& urlTable, std::ifstream& docIn );
00098 
00099   int hashString( const char* str );
00100   void hashToBuckets( std::ifstream& in, const std::string& path );
00101   void createBuckets( const std::string& tmpPath );
00102   void closeBuckets();
00103   void combineBucket( const std::string& outputPath, const std::string& tmpPath, int bucket );
00104   void hashToBuckets( const std::string& inputPath );
00105   void combineRedirectDestinationBucket( const std::string& tmpPath, int i, std::vector<std::stringstream*>& outBuffers, std::vector<std::ofstream*>& outputFiles );
00106 
00107 public:
00108   Combiner( int bins = 10 ) : _bins(bins) {}
00109 
00110   void combineRedirectDestinationBuckets( const std::string& tmpPath );
00111   void combineBuckets( const std::string& outputPath, const std::string& tmpPath );
00112   void hashRedirectTargets( const std::string& bucketPath, const std::string& redirectsPath );
00113   void hashToBuckets( const std::string& bucketPath, const std::string& inputPath );
00114   void sortCorpusFiles( const std::string& outputPath, const std::string& preSortPath, const std::string& inputPath );
00115 };
00116 
00117 #endif // INDRI_COMBINER_HPP
00118 

Generated on Wed Nov 3 12:58:52 2004 for Lemur Toolkit by doxygen 1.2.18