00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 #ifndef INDRI_RAWTEXTPARSER_HPP
00020 #define INDRI_RAWTEXTPARSER_HPP
00021
00022 #define PARSER_MAX_WORD_LENGTH (30)
00023
00024 class RawTextParser {
00025 private:
00026 std::ifstream _in;
00027 char* _buffer;
00028 char* _current;
00029 int _bufferSize;
00030
00031 public:
00032 RawTextParser( int memorySize = 1024*1024 ) {
00033 _bufferSize = memorySize;
00034 _buffer = new char[_bufferSize];
00035 }
00036
00037 ~RawTextParser() {
00038 delete _buffer;
00039 }
00040
00041 bool open( const std::string& fileName ) {
00042 _in.open( fileName.c_str(), std::ifstream::in );
00043 return _in.good();
00044 }
00045
00046 void close() {
00047 _in.close();
00048 }
00049
00053
00054 bool parseDocument( std::string& docName, greedy_vector<char*>& words ) {
00055 static const char docPrefix[] = "<DOC>";
00056 static const char endDocPrefix[] = "</DOC>";
00057 static const char docnoPrefix[] = "<DOCNO ";
00058 static const char urlPrefix[] = "<URL";
00059 bool gotDocID = false;
00060 int bufferPos = 0;
00061
00062 while( 1 ) {
00063 int remainingSpace = _bufferSize - bufferPos;
00064 _in.getline( _buffer + bufferPos, remainingSpace );
00065 int length = _in.gcount();
00066
00067 if( _in.rdstate() & (std::ifstream::failbit|std::ifstream::eofbit) ) {
00068 if( _in.rdstate() & std::ifstream::eofbit ) {
00069 return false;
00070 }
00071
00072 if( length == remainingSpace - 1 ) {
00073 throw Exception( "RawTextParser", "Buffer size is too small to handle some document in the corpus, use -parserMemory to change." );
00074 }
00075
00076 if( _in.rdstate() & std::ifstream::failbit ) {
00077 throw Exception( "RawTextParser", "Unable to recover from failed read" );
00078 }
00079 }
00080
00081 _buffer[bufferPos+length] = 0;
00082 char* line = _buffer + bufferPos;
00083
00084 if( length && _buffer[bufferPos] == '<' ) {
00085 if( length > sizeof docnoPrefix-1 && !strncmp( docnoPrefix, line, sizeof docnoPrefix-1 ) ) {
00086 docName.assign( line+sizeof docnoPrefix-1, line + length - 2 );
00087 gotDocID = true;
00088 } else if ( length > sizeof endDocPrefix-1 && !strncmp( endDocPrefix, line, sizeof endDocPrefix-1 ) ) {
00089
00090 if( gotDocID )
00091 return true;
00092 }
00093 } else {
00094 if( !gotDocID )
00095 continue;
00096
00097 int i = 0;
00098
00099 while(1) {
00100 for( ; isspace(line[i]) && i<length && line[i]; i++ )
00101 ;
00102
00103 if( i>= length || !line[i] )
00104 break;
00105
00106 char* begin = &line[i];
00107
00108 for( ; !isspace(line[i]) && i<length && line[i]; i++ )
00109 ;
00110
00111 line[i] = 0;
00112 i++;
00113
00114 if( &line[i] - begin > PARSER_MAX_WORD_LENGTH )
00115 begin[PARSER_MAX_WORD_LENGTH-1] = 0;
00116
00117 words.push_back(begin);
00118 }
00119 }
00120
00121 bufferPos += length + 1;
00122 }
00123 }
00124 };
00125
00126 #endif // INDRI_RAWTEXTPARSER_HPP