00001 /*========================================================================== 00002 * Copyright (c) 2001 Carnegie Mellon University. All Rights Reserved. 00003 * 00004 * Use of the Lemur Toolkit for Language Modeling and Information Retrieval 00005 * is subject to the terms of the software license set forth in the LICENSE 00006 * file included with this software, and also available at 00007 * http://www.lemurproject.org/license.html 00008 * 00009 *========================================================================== 00010 */ 00011 00012 00013 /* type definitions for objects we will use */ 00014 #ifndef _INVFPTYPES_H 00015 #define _INVFPTYPES_H 00016 00017 #define IND_VERSION "3.1" 00018 00019 #include "common_headers.hpp" 00020 #include "IndexTypes.hpp" 00021 00022 // suffixes for filenames 00023 #define INVINDEX ".invf" 00024 #define INVFPINDEX ".invfp" 00025 #define INVLOOKUP ".invlookup" 00026 #define DTINDEX ".dt" 00027 #define DTLOOKUP ".dtlookup" 00028 #define TERMIDMAP ".tid" 00029 #define TERMIDSTRMAP ".tidstr" 00030 #define DOCIDMAP ".did" 00031 #define DOCIDSTRMAP ".didstr" 00032 #define MAINTOC ".inv" 00033 #define INVFPTOC ".ifp" 00034 #define DOCMGRMAP ".dm" 00035 00036 // what to call out of vocabulary ids 00037 #define INVALID_STR "[OOV]" 00038 00039 // name for parameters 00040 #define VERSION_PAR "VERSION" 00041 #define NUMDOCS_PAR "NUM_DOCS" 00042 #define NUMTERMS_PAR "NUM_TERMS" 00043 #define NUMUTERMS_PAR "NUM_UNIQUE_TERMS" 00044 #define AVEDOCLEN_PAR "AVE_DOCLEN" 00045 #define INVINDEX_PAR "INV_INDEX" 00046 #define INVLOOKUP_PAR "INV_LOOKUP" 00047 #define DTINDEX_PAR "DT_INDEX" 00048 #define DTLOOKUP_PAR "DT_LOOKUP" 00049 #define TERMIDMAP_PAR "TERMIDS" 00050 #define TERMIDSTRMAP_PAR "TERMIDSTRS" 00051 #define DOCIDMAP_PAR "DOCIDS" 00052 #define DOCIDSTRMAP_PAR "DOCIDSTRS" 00053 #define NUMDT_PAR "NUM_DTFILES" 00054 #define NUMINV_PAR "NUM_INVFILES" 00055 #define DOCMGR_PAR "DOCMGR_IDS" 00056 00057 struct LocatedTerm { // pair of term and its location 00058 TERMID_T term; 00059 LOC_T loc; 00060 }; 00061 00062 struct LLTerm { // pair of term and list of locations 00063 TERMID_T term; 00064 vector<LOC_T> loc; 00065 }; 00066 00067 struct dt_entry { // an entry in the lookup table for docterm lists index 00068 FILEID_T fileid; // which file the word is in 00069 long offset; // what the offset into the file is 00070 int length; // the length of the inverted list 00071 int docmgr; // the docmgr id of manager for this doc 00072 }; 00073 00074 struct inv_entry { // an entry in the lookup table for docterm lists index 00075 FILEID_T fileid; // which file the word is in 00076 long offset; // what the offset into the file is 00077 COUNT_T ctf; // collection term freq 00078 COUNT_T df; // doc freq 00079 }; 00080 00081 struct ltstr 00082 { 00083 bool operator()(char* s1, char* s2) const{ 00084 return strcmp(s1, s2) < 0; 00085 } 00086 }; 00087 00088 #endif