compute_unigram.c Source File

00001 /*=====================================================================
00002                 =======   COPYRIGHT NOTICE   =======
00003 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00004 Ronald Rosenfeld and Philip Clarkson.
00005 
00006 All rights reserved.
00007 
00008 This software is made available for research purposes only.  It may be
00009 redistributed freely for this purpose, in full or in part, provided
00010 that this entire copyright notice is included on any copies of this
00011 software and applications and derivations thereof.
00012 
00013 This software is provided on an "as is" basis, without warranty of any
00014 kind, either expressed or implied, as to any matter including, but not
00015 limited to warranty of fitness of purpose, or merchantability, or
00016 results obtained from use of this software.
00017 ======================================================================*/
00018 
00019 /* Calculate the probabilities for each 1-gram */
00020 
00021 /* Basically copied from version 1 */
00022 
00023 
00061 #include <math.h>
00062 #include <stdio.h>
00063 #include "ngram.h"
00064 #include "idngram2lm.h"
00065 #include "pc_libs/pc_general.h"
00066 
00067 void compute_unigram(ng_t *ng,int verbosity) {
00068 
00069   int i;
00070   int count;
00071   int n_zerotons;
00072   int num_of_types;
00073   double floatN;
00074   double prob;
00075   double total_prob;
00076   double discount_mass;
00077   double total_zeroton_mass;
00078   double prob_zeroton;
00079   double prob_singleton;
00080   double leftover_mass;
00081 
00082   /* Make sure that we don't have a type 2 vocab and an UNK */
00083 
00084   if (ng->vocab_type==OPEN_VOCAB_2 && return_count(ng->four_byte_counts,
00085                                                    ng->count_table[0],
00086                                                    ng->count[0],
00087                                                    ng->count4[0],
00088                                                    0) != 0) {
00089     quit(-1,"Error : Open vocabulary type 2 requested, but there were OOVs in the \ntraining data.\n");
00090   }
00091 
00092   if (ng->vocab_type == CLOSED_VOCAB) {
00093     ng->uni_probs[0] = 1e-99;
00094   }
00095 
00096   /* Make sure all context cues have a zero count */
00097 
00098   if (ng->no_of_ccs > 0) {
00099     for (i=ng->first_id;i<=ng->vocab_size;i++) {
00100       if (ng->context_cue[i] && return_count(ng->four_byte_counts,
00101                                              ng->count_table[0],
00102                                              ng->count[0],
00103                                              ng->count4[0],
00104                                              i) != 0) {
00105         quit(-1,"Error : Context cue word has a non zero count.\n");
00106       }
00107     }
00108   }
00109 
00110   /* Compute the discounted unigram, and the total */
00111 
00112   floatN = (double) ng->n_unigrams;
00113 
00114   total_prob = 0.0;
00115 
00116   num_of_types = 0;
00117 
00118   for (i=ng->first_id;i<=ng->vocab_size;i++) {
00119     if (return_count(ng->four_byte_counts,
00120                      ng->count_table[0],
00121                      ng->count[0],
00122                      ng->count4[0],
00123                      i) > 0) {
00124       num_of_types++;
00125     }
00126   }
00127 
00128 
00129   for (i=ng->first_id;i<=ng->vocab_size;i++) {
00130     
00131     count = return_count(ng->four_byte_counts,
00132                          ng->count_table[0],
00133                          ng->count[0],
00134                          ng->count4[0],
00135                          i);
00136     prob = count/floatN;
00137     switch (ng->discounting_method) {
00138     case GOOD_TURING:
00139       if (count > 0 && count <= ng->disc_range[0]) {
00140         prob *= ng->gt_disc_ratio[0][count];
00141       }
00142       else {
00143         if (count == 0) {
00144           prob = 1e-99;
00145         }
00146       }
00147       break;
00148     case LINEAR:
00149       if (count > 0) {
00150         prob *= ng->lin_disc_ratio[0];
00151       }
00152       else {
00153         prob = 1e-99;
00154       }
00155       break;
00156     case ABSOLUTE:
00157       if (count > 0) {
00158         prob *= (count - ng->abs_disc_const[0])/count;
00159       }
00160       else {
00161         prob = 1e-99;
00162       }
00163       break;
00164     case WITTEN_BELL:
00165       if (count > 0) {
00166         prob *= floatN/(floatN+num_of_types);
00167       }
00168       else {
00169         prob = 1e-99;
00170       }
00171       break;
00172     }
00173     pc_message(verbosity,4,"   prob[%d] = %.8g count = %d \n",i,prob,count);
00174     ng->uni_probs[i] = prob;
00175     total_prob += prob;
00176   }
00177 
00178   /* Compute the discount mass */
00179 
00180   discount_mass = 1.0 - total_prob;
00181 
00182   pc_message(verbosity,2,"Unigrams's discount mass is %g (n1/N = %g)\n",
00183              discount_mass,ng->freq_of_freq[0][1]/floatN);
00184 
00185   if (discount_mass < 1e-10 && discount_mass != 0.0) {
00186     discount_mass = 0.0;
00187     pc_message(verbosity,2,"Discount mass was rounded to zero.\n");
00188   }
00189   
00190   /* Compute P(zeroton) & assign it to all zerotons (except context
00191      cues) */
00192 
00193   leftover_mass = discount_mass;
00194   n_zerotons = ng->freq_of_freq[0][0] - ng->no_of_ccs;
00195 
00196   if ((n_zerotons > 0) && (discount_mass > 0.0)) {
00197     total_zeroton_mass = discount_mass;
00198     if (ng->vocab_type == OPEN_VOCAB_2) {
00199       total_zeroton_mass = (1.0 - ng->oov_fraction)*discount_mass;
00200     }
00201     prob_zeroton = total_zeroton_mass / n_zerotons;
00202     prob_singleton = 1 / floatN;
00203     switch (ng->discounting_method) {
00204     case GOOD_TURING:
00205       if (ng->disc_range[0] >= 1) {
00206         prob_singleton *= ng->gt_disc_ratio[0][1];
00207       }
00208       break;
00209     case LINEAR:
00210       prob_singleton *= ng->lin_disc_ratio[0];
00211       break;
00212     case ABSOLUTE:
00213       prob_singleton *= (1-ng->abs_disc_const[0]);
00214       break;
00215     case WITTEN_BELL:
00216       prob_singleton *= floatN/(floatN + num_of_types);
00217       break;
00218     }
00219     pc_message(verbosity,2,"%d zerotons, P(zeroton) = %g P(singleton) = %g\n",
00220                n_zerotons,prob_zeroton,prob_singleton);
00221     if (prob_zeroton > ng->zeroton_fraction*prob_singleton) {
00222       prob_zeroton = ng->zeroton_fraction*prob_singleton;
00223       pc_message(verbosity,1,"P(zeroton) was reduced to %.10f (%.3f of P(singleton))\n",prob_zeroton,ng->zeroton_fraction);
00224     }
00225 
00226     for (i=ng->first_id;i<=ng->vocab_size;i++) {
00227       if ((return_count(ng->four_byte_counts,
00228                         ng->count_table[0],
00229                         ng->count[0],
00230                         ng->count4[0],
00231                         i) == 0) && (!ng->context_cue[i])) {
00232         ng->uni_probs[i] = prob_zeroton;
00233       }
00234     }
00235 
00236     total_zeroton_mass = n_zerotons * prob_zeroton;
00237     leftover_mass = discount_mass - total_zeroton_mass;
00238   }
00239 
00240   /* Do renormalisation due to UNK */
00241  
00242   if (ng->vocab_type == OPEN_VOCAB_2) {
00243     ng->uni_probs[0] += leftover_mass;
00244     if (ng->uni_probs[0] <= 0.0) {
00245       ng->uni_probs[0] = 1e-99;
00246     }
00247   }
00248   else {
00249     if (fabs(leftover_mass) > 1e-10) {
00250       for (i=ng->first_id;i<=ng->vocab_size;i++) {
00251         ng->uni_probs[i] /= (1.0 - leftover_mass);
00252       }
00253       if (fabs(leftover_mass)>1e-8) {
00254         pc_message(verbosity,1,"Unigram was renormalized to absorb a mass of %g\n",leftover_mass);
00255       }
00256     }
00257   }
00258   pc_message(verbosity,1,"prob[UNK] = %g\n",ng->uni_probs[0]);
00259   if ((n_zerotons>0) && (discount_mass<=0.0)) {
00260     pc_message(verbosity,1,"WARNING: %d non-context-cue words have zero probability\n\n",n_zerotons);
00261   }
00262   if (verbosity>=4) {
00263     fprintf(stderr,"THE FINAL UNIGRAM:\n");
00264     for (i=ng->first_id;i<=ng->vocab_size;i++) {
00265       fprintf(stderr," unigram[%d]=%g\n",i,ng->uni_probs[i]);
00266     }
00267   }
00268 
00269   /* Test resulting unigram for consistency */
00270 
00271   total_prob = 0.0;
00272   for (i=ng->first_id;i<=ng->vocab_size;i++) {
00273     total_prob += ng->uni_probs[i];
00274   }
00275   if (fabs(1.0-total_prob) > 1e-6) {
00276     quit(-1,"ERROR: sum[P(w)] = %.10f\n",total_prob);
00277   }
00278   if (fabs(1.0-total_prob) > 1e-9) {
00279     pc_message(verbosity,1,"WARNING: sum[P(w)] = %.10f\n\n",total_prob);
00280   }
00281 
00282   /* Precompute logprobs */
00283 
00284   for (i=ng->first_id;i<=ng->vocab_size;i++) {
00285     ng->uni_log_probs[i] = log(ng->uni_probs[i]);
00286   }
00287 
00288 }