Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

TagList.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // TagList
00015 //
00016 // March 2004 -- metzler
00017 //
00018 
00019 #include "indri/Tag.hpp"
00020 #include <stdio.h>
00021 #include <string.h>
00022 #include <indri/greedy_vector>
00023 #include "indri/TagExtent.hpp"
00024 #include <iostream>
00025 #include "indri/MetadataPair.hpp"
00026 #include "indri/Buffer.hpp"
00027 
00028 #ifndef _TAGLIST_HPP
00029 #define _TAGLIST_HPP
00030 
00031 class TagList {
00032 private:
00033   struct tag_entry {
00034     const char* name;
00035     const char* conflation;
00036     int next;
00037     int begin;
00038     int end;
00039   };
00040 
00041   greedy_vector<tag_entry> _tags;
00042   int _openList;
00043 
00044 public:
00045   TagList() {
00046     clear();
00047   }
00048 
00049   void clear() {
00050     _tags.clear();
00051     _openList = -1;
00052   }
00053 
00054   // we assume here that name is more or less immutable
00055   // so we can store a pointer to it.  This is a reasonable
00056   // assumption, because if the tag is indexed, its name is
00057   // in a hash table somewhere, and we can just point to that
00058   // name copy.
00059   void addTag(const char *name, const char* conflation, int begin) {
00060     // because of conflations, all kinds of messy stuff
00061     // happens if there's already an open tag with the same
00062     // conflation as this one.  Therefore, we have to go looking
00063     // for all open tags with this conflation; if there are any,
00064     // this tag doesn't get added.
00065     int list = _openList;
00066 
00067     while( list >= 0 ) {
00068       tag_entry& entry = _tags[list];
00069 
00070       if( !strcmp( entry.conflation, conflation ) ) {
00071         // we already have one of these
00072         return;
00073       }
00074    
00075       list = entry.next;
00076     }
00077 
00078     // all clear now to add the tag:
00079     tag_entry t;
00080     t.name = name;
00081     t.conflation = conflation;
00082     t.begin = begin;
00083     t.end = -1;
00084     t.next = _openList;
00085     _tags.push_back(t);
00086     _openList = _tags.size()-1;
00087   }
00088 
00089   void endTag(const char *name, const char* conflation, int end) {
00090     int list = _openList;
00091     int prev = -1;
00092 
00093     while( list >= 0 ) {
00094       tag_entry& entry = _tags[list];
00095 
00096       if( !strcmp( entry.name, name ) ) {
00097         // found a tag to close
00098         entry.end = end;
00099         int next = entry.next;
00100 
00101         // unlink from open list
00102         if( prev == -1 ) {
00103           _openList = next;
00104         } else {
00105           _tags[prev].next = next;
00106         }
00107         
00108         return;
00109       } else {
00110         // this wasn't the tag, so keep looking
00111         prev = list;
00112         list = entry.next;
00113       }
00114     }
00115   }
00116 
00117   void writeTagList( greedy_vector<TagExtent>& tags ) {
00118     // look through the tags vector; they're already in sorted order by open
00119     // position.  Only add closed tags.
00120 
00121     for( unsigned int i=0; i<_tags.size(); i++ ) {
00122       tag_entry& entry = _tags[i];
00123 
00124       if( entry.end > 0 ) {
00125         TagExtent extent;
00126         extent.begin = entry.begin;
00127         extent.end = entry.end;
00128         extent.name = entry.conflation;
00129         extent.number = 0;
00130         tags.push_back(extent);
00131       }
00132     }
00133   }
00134 
00135   // in this case, we'll treat the list of tags in this list
00136   // as if they were offsets into a metadata list
00137   void writeMetadataList( greedy_vector<MetadataPair>& pairs, Buffer& buffer, const char* docText ) {
00138     for( unsigned int i=0; i<_tags.size(); i++ ) {
00139       tag_entry& entry = _tags[i];
00140 
00141       if( entry.end > 0 ) {
00142         MetadataPair pair;
00143         
00144         // copy the text into a buffer
00145         int length = entry.end - entry.begin;
00146         char* spot = buffer.write(length+1);
00147         strncpy( spot, docText + entry.begin, length);
00148         spot[length] = 0;
00149 
00150         pair.key = entry.conflation;
00151         pair.value = spot;
00152         pair.valueLength = length+1;
00153 
00154         // docno is special -- its value must be stripped
00155         if( !strcmp( pair.key, "docno" ) ) {
00156           pair.stripValue();
00157         }
00158 
00159         pairs.push_back(pair);
00160       }
00161     }
00162   }
00163 
00164 };
00165 
00166 #endif

Generated on Wed Nov 3 12:59:05 2004 for Lemur Toolkit by doxygen1.2.18