00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 #ifndef _BASICFILESTREAM_HPP 00014 #define _BASICFILESTREAM_HPP 00015 00017 00055 #include "common_headers.hpp" 00056 #include <cassert> 00057 #include <cstdio> 00058 #include <cstring> 00059 #include "DocStream.hpp" 00060 #include "Exception.hpp" 00061 00062 00063 #define MAXLINE 65536 00064 00065 00067 00068 class BasicTokenDoc : public Document { 00069 public: 00070 BasicTokenDoc() { 00071 } 00072 BasicTokenDoc(ifstream *stream): docStr(stream) { 00073 } 00074 void startTermIteration() const; 00075 00076 const char *getID() const { return id;} 00077 00078 bool hasMore() const{ return (strcmp(curWord, "</DOC>") != 0);} 00079 00080 const Term * nextTerm() const; 00081 00082 void skipToEnd() const; 00083 friend class BasicDocStream; 00084 private: 00085 void readID(); 00086 mutable char *curWord; 00087 mutable char buf1[20000]; 00088 mutable char buf2[20000]; 00089 char id[2000]; 00090 ifstream *docStr; 00091 streampos startPos; // starting position of the terms in the file 00092 //replace static BasicTokenTerm t; with attribute 00093 mutable Term t; 00094 }; 00095 00096 00098 class BasicDocStream : public DocStream 00099 { 00100 public: 00101 BasicDocStream() {} 00102 BasicDocStream (const string &inputFile); 00103 00104 virtual ~BasicDocStream() { delete ifs;} 00105 00106 public: 00107 00108 bool hasMore(); 00109 00110 void startDocIteration(); 00111 00112 Document *nextDoc(); 00113 00114 private: 00115 char file[1024]; 00116 ifstream *ifs; 00117 char buf[2000]; 00118 bool nextTokenRead; 00119 // replace static BasicTokenDoc doc; with attribute 00120 BasicTokenDoc doc; 00121 }; 00122 00123 00124 00125 00126 #endif 00127 00128 00129 00130