00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_TAGGEDTEXTPARSER_HPP
00020 #define INDRI_TAGGEDTEXTPARSER_HPP
00021
00022 #include <stdio.h>
00023 #include <ctype.h>
00024 #include <string.h>
00025 #include <string>
00026 #include <vector>
00027 #include <map>
00028 #include "indri/HashTable.hpp"
00029 #include "indri/TagList.hpp"
00030 #include "indri/IndriParser.hpp"
00031 #include "indri/Buffer.hpp"
00032 #include "string-set.h"
00033
// Not referenced anywhere else in this header; presumably the maximum length
// of a document-number (DOCNO) string -- TODO confirm against the parser sources.
00034 #define MAX_DOCNO_LENGTH 128
// Element count of the TaggedTextParser::start_tag and end_tag char arrays.
00035 #define PARSER_MAX_BUF_SIZE 1024
00036
// Hash functor for std::string keys (used as the hasher of
// TaggedTextParser::StrHashTable).
//
// The hash is the sum of the key's characters, each taken as an unsigned
// byte. It is order-insensitive, so permutations of the same characters
// collide; an empty key hashes to 0.
class StringHash {
public:
  // key: string to hash. Taken by const reference so that hashing does not
  //      copy the string on every call.
  // Returns the byte sum converted to int.
  int operator() (const std::string& key) const {
    // Accumulate in unsigned arithmetic: wrap-around is well defined there,
    // whereas overflowing a signed int would be undefined behaviour.
    unsigned int sum = 0;
    for (std::string::size_type i = 0; i < key.length(); ++i)
      sum += static_cast<unsigned char>(key[i]);
    return static_cast<int>(sum);
  }
};
00046
// Three-way comparison functor for std::string keys (used as the comparator
// of TaggedTextParser::StrHashTable).
class StringComparator {
public:
  // one, two: strings to compare. Taken by const reference so that a
  //           comparison does not copy either operand.
  // Returns the result of one.compare(two): negative when `one` orders
  // before `two`, zero when they are equal, positive otherwise.
  int operator() (const std::string& one, const std::string& two) const {
    return one.compare(two);
  }
};
00053
00054 class TaggedTextParser : public indri::Parser {
00055 public:
00056 TaggedTextParser();
00057 ~TaggedTextParser();
00058
00059 void setTags( const std::vector<std::string>& include,
00060 const std::vector<std::string>& exclude,
00061 const std::vector<std::string>& index,
00062 const std::vector<std::string>& metadata,
00063 const std::map<std::string,std::string>& conflations );
00064
00065 ParsedDocument* parse( UnparsedDocument* document );
00066
00067 void handle( UnparsedDocument* document );
00068 void setHandler( ObjectHandler<ParsedDocument>& h );
00069
00070 protected:
00071 typedef HashTable<std::string, std::string, StringHash, StringComparator> StrHashTable;
00072
00073 virtual void handleToken(char *token, int type, long pos);
00074 virtual void initialize( UnparsedDocument* unparsed, ParsedDocument* parsed );
00075 virtual void cleanup( UnparsedDocument* unparsed, ParsedDocument* parsed );
00076
00077 void addTag(const char *s, const char* c, int pos) { tl->addTag(s, c, pos); }
00078 void endTag(const char *s, const char* c, int pos) { tl->endTag(s, c, pos); }
00079
00080 void addMetadataTag(const char* s, const char* c, int pos) { _metaList->addTag(s, c, pos); }
00081 void endMetadataTag(const char* s, const char* c, int pos) { _metaList->endTag(s, c, pos); }
00082
00083
00084 TagList* tl;
00085 TagList* _metaList;
00086 Buffer _termBuffer;
00087
00088 void writeToken(char *token);
00089
00090 struct tag_properties {
00091 const char* name;
00092 const char* conflation;
00093 bool index;
00094 bool exclude;
00095 bool include;
00096 bool metadata;
00097 };
00098 tag_properties* _findTag(const char* name);
00099 tag_properties* _buildTag( const std::string& name, const std::map<std::string,std::string>& conflations );
00100 HashTable<const char*, tag_properties*> _tagTable;
00101 void handleTag(char* token, long pos);
00102
00103 const tag_properties* _startExcludeRegion;
00104 const tag_properties* _startIncludeRegion;
00105
00106 bool _exclude;
00107 bool _include;
00108 bool _defaultInclude;
00109
00110 private:
00111 ObjectHandler<ParsedDocument>* _handler;
00112 ParsedDocument _document;
00113
00114 void doParse();
00115 void writeToken(char *token, int start, int end);
00116 char start_tag[PARSER_MAX_BUF_SIZE];
00117 char end_tag[PARSER_MAX_BUF_SIZE];
00118 };
00119
// Integer codes identifying token kinds. Presumably these are the values
// passed as the `type` argument of TaggedTextParser::handleToken -- confirm
// against the tokenizer that produces them.
namespace TaggedTextTokenType {
  const int tag = 1;
  const int upword = 2;
  const int word = 3;
  const int contraction = 4;
  const int acronym = 5;
  const int acronym2 = 6;
  const int unknown = 7;
}  // namespace TaggedTextTokenType (no trailing ';': it would be a stray empty declaration)
00129
00130 #endif // INDRI_TAGGEDTEXTPARSER_HPP
00131
00132
00133
00134
00135