00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include <cstring>
00013 #include "common_headers.hpp"
00014
00015 #ifndef NULL
00016 #define NULL 0
00017 #endif
00018
00019 #ifndef _TEXTHANDLER_HPP
00020 #define _TEXTHANDLER_HPP
00021
00022 #include "PropertyList.hpp"
00023
00024
00025 #define MAXWORDSIZE 1024
00026
00045
00046
00052
00053
00054
00055 #include <cstdio>
00056
00057 class TextHandler {
00058
00059 public:
00060 enum TokenType {BEGINDOC = 1, ENDDOC = 2, WORDTOK = 3,
00061 BEGINTAG = 4, ENDTAG = 5, SYMBOLTOK = 6};
00062 static const string category;
00063 static const string identifier;
00064
00065 TextHandler() {
00066 textHandler = NULL;
00067 buffer[MAXWORDSIZE-1] = '\0';
00068 cat = category;
00069 iden = identifier;
00070 }
00071 virtual ~TextHandler() {}
00072
00074 virtual void setTextHandler(TextHandler * th) {
00075 textHandler = th;
00076 }
00078 virtual TextHandler * getTextHandler() {
00079 return textHandler;
00080 }
00081
00082 virtual void foundToken(TokenType type,
00083 char * token = NULL,
00084 const char * orig = NULL,
00085 PropertyList * properties = NULL) {
00086 char * t = NULL;
00087
00088 if (token != NULL) {
00089 strncpy(buffer, token, MAXWORDSIZE - 1);
00090 t = buffer;
00091 }
00092
00093 switch (type) {
00094
00095 case BEGINDOC:
00096 t = handleBeginDoc(t, orig, properties);
00097 break;
00098 case ENDDOC:
00099 t = handleEndDoc(t, orig, properties);
00100 break;
00101 case WORDTOK:
00102 t = handleWord(t, orig, properties);
00103 break;
00104 case BEGINTAG:
00105 t = handleBeginTag(t, orig, properties);
00106 break;
00107 case ENDTAG:
00108 t = handleEndTag(t, orig, properties);
00109 break;
00110 case SYMBOLTOK:
00111 t = handleSymbol(t, orig, properties);
00112 break;
00113 }
00114
00115 if (textHandler != NULL) {
00116 textHandler->foundToken(type, t, orig, properties);
00117 }
00118 }
00119
00122 virtual char * handleBeginDoc(char * docno, const char * original,
00123 PropertyList * list) {
00124 return handleDoc(docno);
00125 }
00128 virtual char * handleEndDoc(char * token, const char * original,
00129 PropertyList * list) {
00130 handleEndDoc();
00131 return token;
00132 }
00135 virtual char * handleWord(char * word, const char * original,
00136 PropertyList * list) {
00137 return handleWord(word);
00138 }
00140 virtual char * handleBeginTag(char * tag, const char * original,
00141 PropertyList * list) {
00142 return tag;
00143 }
00145 virtual char * handleEndTag(char * tag, const char * original,
00146 PropertyList * list) {
00147 return tag;
00148 }
00149
00152 virtual char * handleSymbol(char * symbol, const char * original,
00153 PropertyList * list) {
00154 return handleSymbol(symbol);
00155 }
00156
00157
00158
00159
00161
00162 foundToken(BEGINDOC, docno, docno);
00163 }
00164 virtual void foundDoc(char * docno, const char * original) {
00165 foundToken(BEGINDOC, docno, original);
00166 }
00168 virtual void foundWord(char * word) {
00169 foundToken(WORDTOK, word, word);
00170 }
00171 virtual void foundWord(char * word, const char * original) {
00172 foundToken(WORDTOK, word, original);
00173 }
00175 virtual void foundEndDoc() {
00176 foundToken(ENDDOC);
00177 }
00179 virtual void foundSymbol(char * sym) {
00180 foundToken(SYMBOLTOK, sym, sym);
00181 }
00182
00184
00186
00188
00190
00191
00193 virtual string getCategory() { return cat; }
00195 virtual string getIdentifier() { return iden; }
00196 protected:
00198 TextHandler * textHandler;
00199 string cat;
00200 string iden;
00201
00202 char buffer[MAXWORDSIZE];
00203 };
00204
00205
00206 #endif
00207