Main Page   Compound List   File List   Compound Members   File Members  

evallm.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00020 #define MAX_ARGS 200
00021 
00022 #include "evallm.h"
00023 #include "pc_libs/pc_general.h"
00024 #include "rr_libs/general.h"
00025 #include <stdio.h>
00026 #include <string.h>
00027 
00032 void main (int argc, char **argv) {
00033 
00034   ng_t ng;
00035   arpa_lm_t arpa_ng;
00036   char input_string[500];
00037   int num_of_args;
00038   char *args[MAX_ARGS];
00039   char *lm_filename_arpa;
00040   char *lm_filename_binary;
00041   flag told_to_quit;
00042   flag inconsistant_parameters;
00043   flag backoff_from_unk_inc;
00044   flag backoff_from_unk_exc;
00045   flag backoff_from_ccs_inc;
00046   flag backoff_from_ccs_exc;
00047   flag arpa_lm;
00048   flag binary_lm;
00049   flag include_unks;
00050   char *fb_list_filename;
00051   char *probs_stream_filename;
00052   char *annotation_filename;
00053   char *text_stream_filename;
00054   char *oov_filename;
00055   char *ccs_filename;
00056   double log_base;
00057   char wlist_entry[1024];
00058   char current_cc[200];
00059   int current_cc_id;
00060   FILE *context_cues_fp;
00061   int n;
00062 
00063   /* Process command line */
00064 
00065   report_version(&argc,argv);
00066 
00067   if (pc_flagarg(&argc, argv,"-help") || 
00068       argc == 1 || 
00069       (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
00070    fprintf(stderr,"evallm : Evaluate a language model.\n");
00071    fprintf(stderr,"Usage : evallm [ -binary .binlm | \n");
00072    fprintf(stderr,"                 -arpa .arpa [ -context .ccs ] ]\n");
00073    exit(1);
00074   }
00075   
00076 
00077   lm_filename_arpa = salloc(pc_stringarg(&argc, argv,"-arpa",""));
00078 
00079   if (strcmp(lm_filename_arpa,"")) {
00080     arpa_lm = 1;
00081   }
00082   else {
00083     arpa_lm = 0;
00084   }
00085 
00086   lm_filename_binary = salloc(pc_stringarg(&argc, argv,"-binary",""));
00087 
00088   if (strcmp(lm_filename_binary,"")) {
00089     binary_lm = 1;
00090   }
00091   else {
00092     binary_lm = 0;
00093   }
00094 
00095   if (arpa_lm && binary_lm) {
00096     quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
00097   }
00098   
00099   if (!arpa_lm && !binary_lm) {
00100     quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");
00101   }
00102 
00103   ccs_filename = salloc(pc_stringarg(&argc, argv,"-context",""));
00104 
00105   if (binary_lm && strcmp(ccs_filename,"")) {
00106     fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");
00107   }
00108 
00109   pc_report_unk_args(&argc,argv,2);
00110  
00111   /* Load language model */
00112 
00113   if (arpa_lm) {
00114     fprintf(stderr,"Reading in language model from file %s\n",
00115             lm_filename_arpa);
00116     load_arpa_lm(&arpa_ng,lm_filename_arpa);
00117   }
00118   else {
00119     fprintf(stderr,"Reading in language model from file %s\n",
00120             lm_filename_binary);
00121     load_lm(&ng,lm_filename_binary); 
00122   }
00123 
00124   fprintf(stderr,"\nDone.\n");
00125 
00126   if (!arpa_lm) {
00127     n=ng.n;
00128   }
00129   else {
00130     n=arpa_ng.n;
00131   }
00132 
00133   if (arpa_lm) {
00134     arpa_ng.context_cue = 
00135       (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
00136     arpa_ng.no_of_ccs = 0;
00137     if (strcmp(ccs_filename,"")) {
00138       context_cues_fp = rr_iopen(ccs_filename);
00139       while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
00140         if (strncmp(wlist_entry,"##",2)==0) continue;
00141         sscanf (wlist_entry, "%s ",current_cc);
00142         if (strncmp(wlist_entry,"#",1)==0) {
00143           fprintf(stderr,"\n\n===========================================================\n");
00144           fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
00145           fprintf(stderr,     ">>> %s <<<\n",wlist_entry);
00146           fprintf(stderr,     "         '%s' will be included in the context cues list\n",current_cc);
00147           fprintf(stderr,     "         (comments must start with '##')\n");
00148           fprintf(stderr,"===========================================================\n\n");
00149         }
00150         
00151         
00152         if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0) {
00153           quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
00154         }
00155         
00156         arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
00157         arpa_ng.no_of_ccs++;
00158         fprintf(stderr,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
00159       }
00160       rr_iclose(context_cues_fp);
00161     }
00162   }
00163 
00164   /* Process commands */
00165   
00166   told_to_quit = 0;
00167   num_of_args = 0;
00168 
00169   while (!feof(stdin) && !told_to_quit) {
00170     printf("evallm : ");
00171     gets(input_string);
00172 
00173     if (!feof(stdin)) {
00174       parse_comline(input_string,&num_of_args,args);
00175 
00176       log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);
00177 
00178       backoff_from_unk_inc = pc_flagarg(&num_of_args,args,
00179                                         "-backoff_from_unk_inc");
00180       backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,
00181                                         "-backoff_from_ccs_inc");
00182       backoff_from_unk_exc = pc_flagarg(&num_of_args,args,
00183                                         "-backoff_from_unk_exc");
00184       backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,
00185                                         "-backoff_from_ccs_exc");
00186       include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
00187       fb_list_filename = salloc(pc_stringarg(&num_of_args,args,
00188                                              "-backoff_from_list",""));
00189     
00190       text_stream_filename = 
00191         salloc(pc_stringarg(&num_of_args,args,"-text",""));
00192       probs_stream_filename = 
00193         salloc(pc_stringarg(&num_of_args,args,"-probs",""));
00194       annotation_filename = 
00195         salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
00196       oov_filename = salloc(pc_stringarg(&num_of_args,args,"-oovs",""));
00197 
00198 
00199       inconsistant_parameters = 0;
00200     
00201       if (backoff_from_unk_inc && backoff_from_unk_exc) {
00202         fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
00203         fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
00204         inconsistant_parameters = 1;
00205       }
00206 
00207       if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
00208         fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
00209         fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
00210         inconsistant_parameters = 1;
00211       }
00212 
00213       if (num_of_args > 0) {
00214       
00215         if (!inconsistant_parameters) {
00216           if (!strcmp(args[0],"perplexity")) {
00217             compute_perplexity(&ng,
00218                                &arpa_ng,
00219                                text_stream_filename,
00220                                probs_stream_filename,
00221                                annotation_filename,
00222                                oov_filename,
00223                                fb_list_filename,
00224                                backoff_from_unk_inc,
00225                                backoff_from_unk_exc,
00226                                backoff_from_ccs_inc,
00227                                backoff_from_ccs_exc,
00228                                arpa_lm,
00229                                include_unks,
00230                                log_base);
00231           }
00232           else {
00233             if (!strcmp(args[0],"validate")) {
00234 
00235               if (num_of_args != n) {
00236                 fprintf(stderr,"Error : must specify %d words of context.\n",
00237                         n-1);
00238               }
00239               else {
00240               
00241                 /* Assume last n-1 parameters form context */
00242               
00243                 validate(&ng,
00244                          &arpa_ng,
00245                          &(args[num_of_args-n+1]),
00246                          backoff_from_unk_inc,
00247                          backoff_from_unk_exc,
00248                          backoff_from_ccs_inc,
00249                          backoff_from_ccs_exc,
00250                          arpa_lm,
00251                          fb_list_filename);
00252               }
00253             }
00254             else {
00255               if (!strcmp(args[0],"stats")) {
00256                 if (arpa_lm) {
00257                   display_arpa_stats(&arpa_ng);
00258                 }
00259                 else {
00260                   display_stats(&ng);
00261                 }
00262               }
00263               else {
00264                 if (!strcmp(args[0],"quit")) {
00265                   told_to_quit=1;
00266                 }
00267                 else {
00268                   if (!strcmp(args[0],"help")) {
00269 
00270                     printf("The user may specify one of the following commands: \n");
00271                     printf("\n");
00272                     printf(" - perplexity\n");
00273                     printf("\n");
00274                     printf("Computes the perplexity of a given text. May optionally specify words\n");
00275                     printf("from which to force back-off.\n");
00276                     printf("\n");
00277                     printf("Syntax: \n");
00278                     printf("\n");
00279                     printf("perplexity -text .text\n");
00280                     printf("         [ -probs .fprobs ]\n");
00281                     printf("         [ -oovs .oov_file ]\n");
00282                     printf("         [ -annotate .annotation_file ]         \n");
00283                     printf("         [ -backoff_from_unk_inc | -backoff_from_unk_exc ]\n");
00284                     printf("         [ -backoff_from_ccs_inc | -backoff_from_ccs_exc ] \n");
00285                     printf("         [ -backoff_from_list .fblist ]\n");
00286                     printf("         [ -include_unks ]\n");
00287                     printf("\n");
00288                     printf(" - validate\n");
00289                     printf("       \n");
00290                     printf("Calculate the sum of the probabilities of all the words in the\n");
00291                     printf("vocabulary given the context specified by the user.\n");
00292                     printf("\n");
00293                     printf("Syntax: \n");
00294                     printf("\n");
00295                     printf("validate [ -backoff_from_unk -backoff_from_ccs |\n");
00296                     printf("           -backoff_from_list .fblist ]\n");
00297                     printf("         [ -forced_backoff_inc | -forced_back_off_exc ]      \n");
00298                     printf("           word1 word2 ... word_(n-1)\n");
00299                     printf("\n");
00300                     printf("Where n is the n in n-gram. \n");
00301                     printf("\n");
00302                     printf(" - help\n");
00303                     printf("\n");
00304                     printf("Displays this help message.\n");
00305                     printf("\n");
00306                     printf("Syntax: \n");
00307                     printf("\n");
00308                     printf("help\n");
00309                     printf("\n");
00310                     printf(" - quit\n");
00311                     printf("\n");
00312                     printf("Exits the program.\n");
00313                     printf("\n");
00314                     printf("Syntax: \n");
00315                     printf("\n");
00316                     printf("quit\n");   
00317 
00318                   } 
00319               
00320                   else {
00321                     fprintf(stderr,"Unknown command : %s\nType \'help\'\n",
00322                             args[0]);
00323                   }
00324                 }
00325               }
00326             }
00327           }
00328         }
00329       }
00330     }    
00331   }
00332 
00333   fprintf(stderr,"evallm : Done.\n");
00334 
00335   exit(0);
00336   
00337 }
00338 
00339     
00340     
00341     
00342 

Generated on Tue Dec 21 13:54:45 2004 by doxygen1.2.18