00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef INDRI_ANCHORTEXTANNOTATOR_HPP
00024 #define INDRI_ANCHORTEXTANNOTATOR_HPP
00025
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>

#include "indri/Buffer.hpp"
#include "indri/ParsedDocument.hpp"
#include "indri/TagExtent.hpp"
#include "indri/Transformation.hpp"
00032
00033 class AnchorTextAnnotator : public Transformation {
00034 std::ifstream _in;
00035 char _docno[256];
00036 int _count;
00037 Buffer _buffer;
00038 ObjectHandler<ParsedDocument>* _handler;
00039
00040 void _readDocumentHeader() {
00041 char line[65536];
00042
00043 if( !_in.good() || _in.eof() )
00044 return;
00045
00046
00047 _in.getline( _docno, sizeof _docno-1 );
00048
00049 _in.getline( line, sizeof line-1 );
00050
00051
00052 _in.getline( line, sizeof line-1 );
00053 _count = atoi( line+6 );
00054 }
00055
00056 void _fetchText( greedy_vector<TagExtent>& tags, greedy_vector<char*>& terms ) {
00057
00058 TagExtent mainbody;
00059 mainbody.begin = 0;
00060 mainbody.end = terms.size();
00061 mainbody.name = "mainbody";
00062 mainbody.number = 0;
00063
00064 greedy_vector<TagExtent> oldTags;
00065 oldTags = tags;
00066 tags.clear();
00067 tags.push_back( mainbody );
00068 tags.append( oldTags.begin(), oldTags.end() );
00069
00070
00071 char line[65536];
00072 _buffer.clear();
00073
00074 for( int i=0; i<_count; i++ ) {
00075
00076 _in.getline( line, sizeof line-1 );
00077
00078
00079 _in.getline( line, sizeof line-1 );
00080 int textLen = strlen(line+6);
00081 strcpy( _buffer.write(textLen+1), line+6 );
00082 _buffer.unwrite(1);
00083
00084 assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
00085 }
00086 *(_buffer.write(1)) = 0;
00087
00088
00089
00090
00091 char* beginWord = 0;
00092 int beginIndex = 0;
00093 char* buffer = _buffer.front();
00094
00095 for( unsigned int i=0; i<_buffer.position(); i++ ) {
00096 if( isalnum(buffer[i]) && !beginWord ) {
00097 beginWord = buffer+i;
00098
00099 if(!beginIndex)
00100 beginIndex = terms.size();
00101 } else if( isspace(buffer[i]) ) {
00102 if( beginWord )
00103 terms.push_back( beginWord );
00104 buffer[i] = 0;
00105 beginWord = 0;
00106 } else if( buffer[i] == '\"' ) {
00107 buffer[i] = 0;
00108 if( beginWord )
00109 terms.push_back( beginWord );
00110 beginWord = 0;
00111
00112 TagExtent extent;
00113 extent.name = "inlink";
00114 extent.begin = beginIndex;
00115 extent.end = terms.size();
00116 extent.number = 0;
00117
00118 assert( extent.begin <= extent.end );
00119
00120 if( beginIndex )
00121 tags.push_back(extent);
00122
00123 beginIndex = 0;
00124 }
00125 }
00126 }
00127
00128 bool _matchingDocno( ParsedDocument* document ) {
00129
00130 for( size_t i=0; i<document->metadata.size(); i++ ) {
00131 const char* attributeName = document->metadata[i].key;
00132 const char* attributeValue = (const char*) document->metadata[i].value;
00133
00134 if( !strcmp( attributeName, "docno" ) ) {
00135 if( !strcmp( attributeValue, _docno+6 ) ) {
00136 return true;
00137 } else {
00138 return false;
00139 }
00140 }
00141 }
00142
00143 return false;
00144 }
00145
00146 public:
00147 AnchorTextAnnotator() {
00148 _handler = 0;
00149 }
00150
00151 ~AnchorTextAnnotator() {
00152 _in.close();
00153 }
00154
00155 void open( const std::string& anchorFile ) {
00156 _in.close();
00157 _in.clear();
00158 _in.open( anchorFile.c_str() );
00159 _buffer.clear();
00160 _readDocumentHeader();
00161 }
00162
00163 ParsedDocument* transform( ParsedDocument* document ) {
00164 if( _matchingDocno( document ) ) {
00165 _fetchText( document->tags, document->terms );
00166 _readDocumentHeader();
00167 }
00168
00169 return document;
00170 }
00171
00172 void setHandler( ObjectHandler<ParsedDocument>& handler ) {
00173 _handler = &handler;
00174 }
00175
00176 void handle( ParsedDocument* document ) {
00177 _handler->handle( transform( document ) );
00178 }
00179 };
00180
00181 #endif // INDRI_ANCHORTEXTANNOTATOR_HPP
00182