Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

WordDocumentExtractor.hpp

Go to the documentation of this file.
00001 /*==========================================================================
00002  * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
00003  *
00004  * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
00005  * is subject to the terms of the software license set forth in the LICENSE
00006  * file included with this software, and also available at
00007  * http://www.lemurproject.org/license.html
00008  *
00009  *==========================================================================
00010 */
00011 
00012 
00013 //
00014 // WordDocumentExtractor
00015 //
00016 // 14 June 2004 -- tds
00017 //
00018 // Code is based in part on the AutoWord class
00019 // by Poonam Bajaj.
00020 //
00021 
00022 #include "lemur-compat.hpp"
00023 #include "indri/Buffer.hpp"
00024 #include "indri/UnparsedDocument.hpp"
00025 #include "Exception.hpp"
00026 #include <string>
00027 #include "indri/DocumentIterator.hpp"
00028 
00029 #ifndef INDRI_WORDDOCUMENTEXTRACTOR_HPP
00030 #define INDRI_WORDDOCUMENTEXTRACTOR_HPP
00031 #ifdef WIN32
00032 class WordDocumentExtractor : public DocumentIterator { // DocumentIterator subclass; only declared when WIN32 is defined (see the enclosing #ifdef)
00033   void* _internal; // opaque, untyped state; real type is not visible in this header -- presumably managed by initialize()/uninitialize(), confirm in the .cpp
00034 
00035   Buffer _documentTextBuffer; // presumably holds the text extracted from the current document -- TODO confirm against the implementation
00036   UnparsedDocument _unparsedDocument; // NOTE(review): likely the object nextDocument() returns a pointer to; if so, it is only valid while this extractor lives -- confirm
00037   std::string _documentPath; // presumably the filename passed to open() -- TODO confirm
00038   bool _documentWaiting; // NOTE(review): looks like a flag for "an opened document has not yet been returned by nextDocument()" -- confirm
00039 
00040   void initialize(); // private setup helper; behavior is defined in the implementation file, not shown here
00041   void uninitialize(); // private teardown helper; counterpart of initialize()
00042   
00043 public:
00044   WordDocumentExtractor(); // default constructor; takes no arguments
00045   ~WordDocumentExtractor(); // destructor; NOTE(review): not declared virtual here -- check whether DocumentIterator declares a virtual destructor
00046 
00047   void open( const std::string& filename ); // begin working on the file named by filename
00048   UnparsedDocument* nextDocument( ); // returns a pointer to the next document; NOTE(review): end-of-input value (presumably a null pointer) and pointer ownership are not visible here -- confirm
00049   void close(); // counterpart of open(); exact cleanup performed is defined in the implementation file
00050 };
00051 #endif
00052 #endif // INDRI_WORDDOCUMENTEXTRACTOR_HPP

Generated on Wed Nov 3 12:59:07 2004 for Lemur Toolkit by doxygen 1.2.18