AnchorTextAnnotator.hpp Source File

00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // AnchorTextAnnotator
00015 //
00016 // 25 May 2004 -- tds
00017 //
00018 // Reads anchor text in from files created by the 
00019 // combiner, and adds the text to the end of the
00020 // parsed document
00021 //
00022 
00023 #ifndef INDRI_ANCHORTEXTANNOTATOR_HPP
00024 #define INDRI_ANCHORTEXTANNOTATOR_HPP
00025 
00026 #include "indri/Buffer.hpp"
00027 #include "indri/Transformation.hpp"
00028 #include <iostream>
00029 #include "indri/TagExtent.hpp"
00030 #include "indri/ParsedDocument.hpp"
00031 #include <fstream>
00032 
00033 class AnchorTextAnnotator : public Transformation {
00034   std::ifstream _in;
00035   char _docno[256];
00036   int _count;
00037   Buffer _buffer;
00038   ObjectHandler<ParsedDocument>* _handler;
00039 
00040   void _readDocumentHeader() {
00041     char line[65536];
00042 
00043     if( !_in.good() || _in.eof() )
00044       return;
00045 
00046     // DOCNO=
00047     _in.getline( _docno, sizeof _docno-1 );
00048     // DOCURL=
00049     _in.getline( line, sizeof line-1 );
00050 
00051     // LINKS=
00052     _in.getline( line, sizeof line-1 );
00053     _count = atoi( line+6 );
00054   }
00055 
00056   void _fetchText( greedy_vector<TagExtent>& tags, greedy_vector<char*>& terms ) {
00057     // first, surround current text with a mainbody tag
00058     TagExtent mainbody;
00059     mainbody.begin = 0;
00060     mainbody.end = terms.size();
00061     mainbody.name = "mainbody";
00062     mainbody.number = 0;
00063 
00064     greedy_vector<TagExtent> oldTags;
00065     oldTags = tags;
00066     tags.clear();
00067     tags.push_back( mainbody );
00068     tags.append( oldTags.begin(), oldTags.end() );
00069 
00070     // now, fetch the additional terms
00071     char line[65536];
00072     _buffer.clear();
00073 
00074     for( int i=0; i<_count; i++ ) {
00075       // LINK
00076       _in.getline( line, sizeof line-1 );
00077 
00078       // TEXT=
00079       _in.getline( line, sizeof line-1 );
00080       int textLen = strlen(line+6);
00081       strcpy( _buffer.write(textLen+1), line+6 );
00082       _buffer.unwrite(1);
00083 
00084       assert( *(_buffer.front()+_buffer.position()-1) == '\"' && "Last character should be a quote" );
00085     }
00086     *(_buffer.write(1)) = 0;
00087 
00088     // now there's a bunch of text in _buffer, space separated, with each
00089     // link separated by a " symbol
00090 
00091     char* beginWord = 0;
00092     int beginIndex = 0;
00093     char* buffer = _buffer.front();
00094 
00095     for( unsigned int i=0; i<_buffer.position(); i++ ) {
00096       if( isalnum(buffer[i]) && !beginWord ) {
00097         beginWord = buffer+i;
00098 
00099         if(!beginIndex)
00100           beginIndex = terms.size();
00101       } else if( isspace(buffer[i]) ) {
00102         if( beginWord )
00103           terms.push_back( beginWord );
00104         buffer[i] = 0;
00105         beginWord = 0;
00106       } else if( buffer[i] == '\"' ) {
00107         buffer[i] = 0;
00108         if( beginWord )
00109           terms.push_back( beginWord );
00110         beginWord = 0;
00111         
00112         TagExtent extent;
00113         extent.name = "inlink";
00114         extent.begin = beginIndex;
00115         extent.end = terms.size();
00116         extent.number = 0;
00117 
00118         assert( extent.begin <= extent.end );
00119 
00120         if( beginIndex )
00121           tags.push_back(extent);
00122 
00123         beginIndex = 0;
00124       }
00125     }
00126   }
00127 
00128   bool _matchingDocno( ParsedDocument* document ) {
00129     // find DOCNO attribute in document
00130     for( size_t i=0; i<document->metadata.size(); i++ ) {
00131       const char* attributeName = document->metadata[i].key;
00132       const char* attributeValue = (const char*) document->metadata[i].value;
00133 
00134       if( !strcmp( attributeName, "docno" ) ) {
00135         if( !strcmp( attributeValue, _docno+6 ) ) {
00136           return true;
00137         } else {
00138           return false;
00139         }
00140       }
00141     }
00142  
00143     return false;
00144   }
00145 
00146 public:
00147   AnchorTextAnnotator() {
00148     _handler = 0;
00149   }
00150 
00151   ~AnchorTextAnnotator() {
00152     _in.close();
00153   }
00154 
00155   void open( const std::string& anchorFile ) {
00156     _in.close();
00157     _in.clear();
00158     _in.open( anchorFile.c_str() );
00159     _buffer.clear();
00160     _readDocumentHeader();
00161   }
00162 
00163   ParsedDocument* transform( ParsedDocument* document ) {
00164     if( _matchingDocno( document ) ) {
00165       _fetchText( document->tags, document->terms );
00166       _readDocumentHeader();
00167     }
00168   
00169     return document;
00170   }
00171 
00172   void setHandler( ObjectHandler<ParsedDocument>& handler ) {
00173     _handler = &handler;
00174   }
00175 
00176   void handle( ParsedDocument* document ) {
00177     _handler->handle( transform( document ) );
00178   }
00179 };
00180 
00181 #endif // INDRI_ANCHORTEXTANNOTATOR_HPP
00182