00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #define MAX_ARGS 200
00021
00022 #include "evallm.h"
00023 #include "pc_libs/pc_general.h"
00024 #include "rr_libs/general.h"
00025 #include <stdio.h>
00026 #include <string.h>
00027
00032 void main (int argc, char **argv) {
00033
00034 ng_t ng;
00035 arpa_lm_t arpa_ng;
00036 char input_string[500];
00037 int num_of_args;
00038 char *args[MAX_ARGS];
00039 char *lm_filename_arpa;
00040 char *lm_filename_binary;
00041 flag told_to_quit;
00042 flag inconsistant_parameters;
00043 flag backoff_from_unk_inc;
00044 flag backoff_from_unk_exc;
00045 flag backoff_from_ccs_inc;
00046 flag backoff_from_ccs_exc;
00047 flag arpa_lm;
00048 flag binary_lm;
00049 flag include_unks;
00050 char *fb_list_filename;
00051 char *probs_stream_filename;
00052 char *annotation_filename;
00053 char *text_stream_filename;
00054 char *oov_filename;
00055 char *ccs_filename;
00056 double log_base;
00057 char wlist_entry[1024];
00058 char current_cc[200];
00059 int current_cc_id;
00060 FILE *context_cues_fp;
00061 int n;
00062
00063
00064
00065 report_version(&argc,argv);
00066
00067 if (pc_flagarg(&argc, argv,"-help") ||
00068 argc == 1 ||
00069 (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
00070 fprintf(stderr,"evallm : Evaluate a language model.\n");
00071 fprintf(stderr,"Usage : evallm [ -binary .binlm | \n");
00072 fprintf(stderr," -arpa .arpa [ -context .ccs ] ]\n");
00073 exit(1);
00074 }
00075
00076
00077 lm_filename_arpa = salloc(pc_stringarg(&argc, argv,"-arpa",""));
00078
00079 if (strcmp(lm_filename_arpa,"")) {
00080 arpa_lm = 1;
00081 }
00082 else {
00083 arpa_lm = 0;
00084 }
00085
00086 lm_filename_binary = salloc(pc_stringarg(&argc, argv,"-binary",""));
00087
00088 if (strcmp(lm_filename_binary,"")) {
00089 binary_lm = 1;
00090 }
00091 else {
00092 binary_lm = 0;
00093 }
00094
00095 if (arpa_lm && binary_lm) {
00096 quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
00097 }
00098
00099 if (!arpa_lm && !binary_lm) {
00100 quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");
00101 }
00102
00103 ccs_filename = salloc(pc_stringarg(&argc, argv,"-context",""));
00104
00105 if (binary_lm && strcmp(ccs_filename,"")) {
00106 fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");
00107 }
00108
00109 pc_report_unk_args(&argc,argv,2);
00110
00111
00112
00113 if (arpa_lm) {
00114 fprintf(stderr,"Reading in language model from file %s\n",
00115 lm_filename_arpa);
00116 load_arpa_lm(&arpa_ng,lm_filename_arpa);
00117 }
00118 else {
00119 fprintf(stderr,"Reading in language model from file %s\n",
00120 lm_filename_binary);
00121 load_lm(&ng,lm_filename_binary);
00122 }
00123
00124 fprintf(stderr,"\nDone.\n");
00125
00126 if (!arpa_lm) {
00127 n=ng.n;
00128 }
00129 else {
00130 n=arpa_ng.n;
00131 }
00132
00133 if (arpa_lm) {
00134 arpa_ng.context_cue =
00135 (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));
00136 arpa_ng.no_of_ccs = 0;
00137 if (strcmp(ccs_filename,"")) {
00138 context_cues_fp = rr_iopen(ccs_filename);
00139 while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
00140 if (strncmp(wlist_entry,"##",2)==0) continue;
00141 sscanf (wlist_entry, "%s ",current_cc);
00142 if (strncmp(wlist_entry,"#",1)==0) {
00143 fprintf(stderr,"\n\n===========================================================\n");
00144 fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
00145 fprintf(stderr, ">>> %s <<<\n",wlist_entry);
00146 fprintf(stderr, " '%s' will be included in the context cues list\n",current_cc);
00147 fprintf(stderr, " (comments must start with '##')\n");
00148 fprintf(stderr,"===========================================================\n\n");
00149 }
00150
00151
00152 if (sih_lookup(arpa_ng.vocab_ht,current_cc,¤t_cc_id) == 0) {
00153 quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
00154 }
00155
00156 arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
00157 arpa_ng.no_of_ccs++;
00158 fprintf(stderr,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
00159 }
00160 rr_iclose(context_cues_fp);
00161 }
00162 }
00163
00164
00165
00166 told_to_quit = 0;
00167 num_of_args = 0;
00168
00169 while (!feof(stdin) && !told_to_quit) {
00170 printf("evallm : ");
00171 gets(input_string);
00172
00173 if (!feof(stdin)) {
00174 parse_comline(input_string,&num_of_args,args);
00175
00176 log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);
00177
00178 backoff_from_unk_inc = pc_flagarg(&num_of_args,args,
00179 "-backoff_from_unk_inc");
00180 backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,
00181 "-backoff_from_ccs_inc");
00182 backoff_from_unk_exc = pc_flagarg(&num_of_args,args,
00183 "-backoff_from_unk_exc");
00184 backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,
00185 "-backoff_from_ccs_exc");
00186 include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
00187 fb_list_filename = salloc(pc_stringarg(&num_of_args,args,
00188 "-backoff_from_list",""));
00189
00190 text_stream_filename =
00191 salloc(pc_stringarg(&num_of_args,args,"-text",""));
00192 probs_stream_filename =
00193 salloc(pc_stringarg(&num_of_args,args,"-probs",""));
00194 annotation_filename =
00195 salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
00196 oov_filename = salloc(pc_stringarg(&num_of_args,args,"-oovs",""));
00197
00198
00199 inconsistant_parameters = 0;
00200
00201 if (backoff_from_unk_inc && backoff_from_unk_exc) {
00202 fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
00203 fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
00204 inconsistant_parameters = 1;
00205 }
00206
00207 if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
00208 fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
00209 fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
00210 inconsistant_parameters = 1;
00211 }
00212
00213 if (num_of_args > 0) {
00214
00215 if (!inconsistant_parameters) {
00216 if (!strcmp(args[0],"perplexity")) {
00217 compute_perplexity(&ng,
00218 &arpa_ng,
00219 text_stream_filename,
00220 probs_stream_filename,
00221 annotation_filename,
00222 oov_filename,
00223 fb_list_filename,
00224 backoff_from_unk_inc,
00225 backoff_from_unk_exc,
00226 backoff_from_ccs_inc,
00227 backoff_from_ccs_exc,
00228 arpa_lm,
00229 include_unks,
00230 log_base);
00231 }
00232 else {
00233 if (!strcmp(args[0],"validate")) {
00234
00235 if (num_of_args != n) {
00236 fprintf(stderr,"Error : must specify %d words of context.\n",
00237 n-1);
00238 }
00239 else {
00240
00241
00242
00243 validate(&ng,
00244 &arpa_ng,
00245 &(args[num_of_args-n+1]),
00246 backoff_from_unk_inc,
00247 backoff_from_unk_exc,
00248 backoff_from_ccs_inc,
00249 backoff_from_ccs_exc,
00250 arpa_lm,
00251 fb_list_filename);
00252 }
00253 }
00254 else {
00255 if (!strcmp(args[0],"stats")) {
00256 if (arpa_lm) {
00257 display_arpa_stats(&arpa_ng);
00258 }
00259 else {
00260 display_stats(&ng);
00261 }
00262 }
00263 else {
00264 if (!strcmp(args[0],"quit")) {
00265 told_to_quit=1;
00266 }
00267 else {
00268 if (!strcmp(args[0],"help")) {
00269
00270 printf("The user may specify one of the following commands: \n");
00271 printf("\n");
00272 printf(" - perplexity\n");
00273 printf("\n");
00274 printf("Computes the perplexity of a given text. May optionally specify words\n");
00275 printf("from which to force back-off.\n");
00276 printf("\n");
00277 printf("Syntax: \n");
00278 printf("\n");
00279 printf("perplexity -text .text\n");
00280 printf(" [ -probs .fprobs ]\n");
00281 printf(" [ -oovs .oov_file ]\n");
00282 printf(" [ -annotate .annotation_file ] \n");
00283 printf(" [ -backoff_from_unk_inc | -backoff_from_unk_exc ]\n");
00284 printf(" [ -backoff_from_ccs_inc | -backoff_from_ccs_exc ] \n");
00285 printf(" [ -backoff_from_list .fblist ]\n");
00286 printf(" [ -include_unks ]\n");
00287 printf("\n");
00288 printf(" - validate\n");
00289 printf(" \n");
00290 printf("Calculate the sum of the probabilities of all the words in the\n");
00291 printf("vocabulary given the context specified by the user.\n");
00292 printf("\n");
00293 printf("Syntax: \n");
00294 printf("\n");
00295 printf("validate [ -backoff_from_unk -backoff_from_ccs |\n");
00296 printf(" -backoff_from_list .fblist ]\n");
00297 printf(" [ -forced_backoff_inc | -forced_back_off_exc ] \n");
00298 printf(" word1 word2 ... word_(n-1)\n");
00299 printf("\n");
00300 printf("Where n is the n in n-gram. \n");
00301 printf("\n");
00302 printf(" - help\n");
00303 printf("\n");
00304 printf("Displays this help message.\n");
00305 printf("\n");
00306 printf("Syntax: \n");
00307 printf("\n");
00308 printf("help\n");
00309 printf("\n");
00310 printf(" - quit\n");
00311 printf("\n");
00312 printf("Exits the program.\n");
00313 printf("\n");
00314 printf("Syntax: \n");
00315 printf("\n");
00316 printf("quit\n");
00317
00318 }
00319
00320 else {
00321 fprintf(stderr,"Unknown command : %s\nType \'help\'\n",
00322 args[0]);
00323 }
00324 }
00325 }
00326 }
00327 }
00328 }
00329 }
00330 }
00331 }
00332
00333 fprintf(stderr,"evallm : Done.\n");
00334
00335 exit(0);
00336
00337 }
00338
00339
00340
00341
00342