Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

RawTextParser.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // RawTextParser
00015 //
00016 // 10 February 2004 -- tds
00017 //
00018 
00019 #ifndef INDRI_RAWTEXTPARSER_HPP
00020 #define INDRI_RAWTEXTPARSER_HPP
00021 
00022 #define PARSER_MAX_WORD_LENGTH (30)
00023 
00024 class RawTextParser {
00025 private:
00026   std::ifstream _in;
00027   char* _buffer;
00028   char* _current;
00029   int _bufferSize;
00030 
00031 public:
00032   RawTextParser( int memorySize = 1024*1024 ) {
00033     _bufferSize = memorySize;
00034     _buffer = new char[_bufferSize]; // let's hope that 1mb is enough for any docs we're gonna see
00035   }
00036 
00037   ~RawTextParser() {
00038     delete _buffer;
00039   }
00040 
00041   bool open( const std::string& fileName ) {
00042     _in.open( fileName.c_str(), std::ifstream::in );
00043     return _in.good();
00044   }
00045 
00046   void close() {
00047     _in.close();
00048   }
00049 
00053 
00054   bool parseDocument( std::string& docName, greedy_vector<char*>& words ) { 
00055     static const char docPrefix[] = "<DOC>";
00056     static const char endDocPrefix[] = "</DOC>";
00057     static const char docnoPrefix[] = "<DOCNO ";
00058     static const char urlPrefix[] = "<URL";
00059     bool gotDocID = false;
00060     int bufferPos = 0;
00061 
00062     while( 1 ) {
00063       int remainingSpace = _bufferSize - bufferPos;
00064       _in.getline( _buffer + bufferPos, remainingSpace );
00065       int length = _in.gcount();
00066 
00067       if( _in.rdstate() & (std::ifstream::failbit|std::ifstream::eofbit) ) {
00068         if( _in.rdstate() & std::ifstream::eofbit ) {
00069           return false; // at end of file, we're done
00070         }
00071         
00072         if( length == remainingSpace - 1 ) {
00073           throw Exception( "RawTextParser", "Buffer size is too small to handle some document in the corpus, use -parserMemory to change." );
00074         }
00075 
00076         if( _in.rdstate() & std::ifstream::failbit ) {
00077           throw Exception( "RawTextParser", "Unable to recover from failed read" );
00078         }
00079       }
00080 
00081       _buffer[bufferPos+length] = 0;
00082       char* line = _buffer + bufferPos;
00083 
00084       if( length && _buffer[bufferPos] == '<' ) {
00085         if( length > sizeof docnoPrefix-1 && !strncmp( docnoPrefix, line, sizeof docnoPrefix-1 ) ) {
00086           docName.assign( line+sizeof docnoPrefix-1, line + length - 2 );
00087           gotDocID = true;
00088         } else if ( length > sizeof endDocPrefix-1 && !strncmp( endDocPrefix, line, sizeof endDocPrefix-1 ) ) {
00089           // handle end doc -- return
00090           if( gotDocID )
00091             return true;
00092         }
00093       } else {
00094         if( !gotDocID )
00095           continue;
00096 
00097         int i = 0;
00098 
00099         while(1) {
00100           for( ; isspace(line[i]) && i<length && line[i]; i++ )
00101             ;
00102 
00103           if( i>= length || !line[i] )
00104             break;
00105 
00106           char* begin = &line[i];
00107 
00108           for( ; !isspace(line[i]) && i<length && line[i]; i++ )
00109             ;
00110 
00111           line[i] = 0;
00112           i++;
00113 
00114           if( &line[i] - begin > PARSER_MAX_WORD_LENGTH )
00115             begin[PARSER_MAX_WORD_LENGTH-1] = 0;
00116 
00117           words.push_back(begin);
00118         }
00119       }
00120 
00121       bufferPos += length + 1; // have to skip trailing \0
00122     }
00123   }
00124 };
00125 
00126 #endif // INDRI_RAWTEXTPARSER_HPP

Generated on Wed Nov 3 12:59:03 2004 for Lemur Toolkit by doxygen 1.2.18