00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_ANCHORTEXTWRITER_HPP
00020 #define INDRI_ANCHORTEXTWRITER_HPP
00021
#include <algorithm>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

#include "indri/Path.hpp"
#include "lemur-compat.hpp"
00026
00027 class AnchorTextWriter : public ObjectHandler<ParsedDocument> {
00028 private:
00029 std::ofstream _out;
00030
00031 public:
00032 AnchorTextWriter( const std::string& outputPath ) {
00033 std::string directory = Path::directory( outputPath );
00034 Path::make( directory );
00035 _out.open( outputPath.c_str(), std::ios::out );
00036 }
00037
00038 ~AnchorTextWriter() {
00039 _out.close();
00040 }
00041
00042 void handle( ParsedDocument* document ) {
00043 greedy_vector<MetadataPair>::iterator iter;
00044
00045 iter = std::find_if( document->metadata.begin(),
00046 document->metadata.end(),
00047 MetadataPair::key_equal( "DOCNO" ) );
00048
00049 const char* docno = (char*)iter->value;
00050
00051 iter = std::find_if( document->metadata.begin(),
00052 document->metadata.end(),
00053 MetadataPair::key_equal( "URL" ) );
00054
00055 const char* page = (char*)iter->value;
00056 const char* url = 0;
00057 int count = 0;
00058 int urlEnd = -1;
00059
00060
00061
00062 char* slash = 0;
00063 if(page) slash = strchr( page, '/' );
00064 if(slash) slash = strchr( slash+1, '/' );
00065 if(slash) slash = strchr( slash+1, '/' );
00066
00067 int domainLength;
00068 if( slash )
00069 domainLength = slash - page;
00070 else
00071 domainLength = strlen(page);
00072
00073
00074 for( unsigned int i=0; i<document->tags.size(); i++ ) {
00075 TagExtent& extent = document->tags[i];
00076
00077
00078 if( !strcmp( extent.name, "absolute-url" ) ) {
00079 url = document->terms[ extent.begin ];
00080 urlEnd = extent.end;
00081
00082
00083 if( url && page && !lemur_compat::strncasecmp( url, page, domainLength ) ) {
00084 url = 0;
00085 urlEnd = -1;
00086 }
00087 } else if( !strcmp( extent.name, "a" ) &&
00088 url &&
00089 urlEnd == extent.begin &&
00090 extent.end - extent.begin > 0 )
00091 {
00092 count++;
00093 url = 0;
00094 }
00095 }
00096
00097
00098 _out << "DOCNO=" << docno << std::endl;
00099 _out << "DOCURL=" << page << std::endl;
00100 _out << "LINKS=" << count << std::endl;
00101 url = 0;
00102 urlEnd = -1;
00103
00104 for( unsigned int i=0; i<document->tags.size(); i++ ) {
00105 TagExtent& extent = document->tags[i];
00106
00107 if( !strcmp( extent.name, "absolute-url" ) ) {
00108 url = document->terms[ extent.begin ];
00109 urlEnd = extent.end;
00110
00111
00112 if( url && page && !lemur_compat::strncasecmp( url, page, domainLength ) ) {
00113 url = 0;
00114 urlEnd = -1;
00115 }
00116 } else if( !strcmp( extent.name, "a" ) &&
00117 url &&
00118 urlEnd == extent.begin &&
00119 extent.end - extent.begin > 0 )
00120 {
00121 int textLength = 0;
00122
00123 _out << "LINKURL=" << url << std::endl;
00124 _out << "TEXT=\"";
00125 for( unsigned int j=extent.begin; j < extent.end && textLength < 60000; j++ ) {
00126 if( !document->terms[j] )
00127 continue;
00128
00129 textLength += strlen(document->terms[j])+1;
00130 _out << document->terms[j] << " ";
00131 }
00132 _out << "\"" << std::endl;
00133
00134
00135 url = 0;
00136 }
00137 }
00138 }
00139 };
00140
00141 #endif // INDRI_ANCHORTEXTWRITER_HPP
00142