00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00061 #include <math.h>
00062 #include <stdio.h>
00063 #include "ngram.h"
00064 #include "idngram2lm.h"
00065 #include "pc_libs/pc_general.h"
00066
00067 void compute_unigram(ng_t *ng,int verbosity) {
00068
00069 int i;
00070 int count;
00071 int n_zerotons;
00072 int num_of_types;
00073 double floatN;
00074 double prob;
00075 double total_prob;
00076 double discount_mass;
00077 double total_zeroton_mass;
00078 double prob_zeroton;
00079 double prob_singleton;
00080 double leftover_mass;
00081
00082
00083
00084 if (ng->vocab_type==OPEN_VOCAB_2 && return_count(ng->four_byte_counts,
00085 ng->count_table[0],
00086 ng->count[0],
00087 ng->count4[0],
00088 0) != 0) {
00089 quit(-1,"Error : Open vocabulary type 2 requested, but there were OOVs in the \ntraining data.\n");
00090 }
00091
00092 if (ng->vocab_type == CLOSED_VOCAB) {
00093 ng->uni_probs[0] = 1e-99;
00094 }
00095
00096
00097
00098 if (ng->no_of_ccs > 0) {
00099 for (i=ng->first_id;i<=ng->vocab_size;i++) {
00100 if (ng->context_cue[i] && return_count(ng->four_byte_counts,
00101 ng->count_table[0],
00102 ng->count[0],
00103 ng->count4[0],
00104 i) != 0) {
00105 quit(-1,"Error : Context cue word has a non zero count.\n");
00106 }
00107 }
00108 }
00109
00110
00111
00112 floatN = (double) ng->n_unigrams;
00113
00114 total_prob = 0.0;
00115
00116 num_of_types = 0;
00117
00118 for (i=ng->first_id;i<=ng->vocab_size;i++) {
00119 if (return_count(ng->four_byte_counts,
00120 ng->count_table[0],
00121 ng->count[0],
00122 ng->count4[0],
00123 i) > 0) {
00124 num_of_types++;
00125 }
00126 }
00127
00128
00129 for (i=ng->first_id;i<=ng->vocab_size;i++) {
00130
00131 count = return_count(ng->four_byte_counts,
00132 ng->count_table[0],
00133 ng->count[0],
00134 ng->count4[0],
00135 i);
00136 prob = count/floatN;
00137 switch (ng->discounting_method) {
00138 case GOOD_TURING:
00139 if (count > 0 && count <= ng->disc_range[0]) {
00140 prob *= ng->gt_disc_ratio[0][count];
00141 }
00142 else {
00143 if (count == 0) {
00144 prob = 1e-99;
00145 }
00146 }
00147 break;
00148 case LINEAR:
00149 if (count > 0) {
00150 prob *= ng->lin_disc_ratio[0];
00151 }
00152 else {
00153 prob = 1e-99;
00154 }
00155 break;
00156 case ABSOLUTE:
00157 if (count > 0) {
00158 prob *= (count - ng->abs_disc_const[0])/count;
00159 }
00160 else {
00161 prob = 1e-99;
00162 }
00163 break;
00164 case WITTEN_BELL:
00165 if (count > 0) {
00166 prob *= floatN/(floatN+num_of_types);
00167 }
00168 else {
00169 prob = 1e-99;
00170 }
00171 break;
00172 }
00173 pc_message(verbosity,4," prob[%d] = %.8g count = %d \n",i,prob,count);
00174 ng->uni_probs[i] = prob;
00175 total_prob += prob;
00176 }
00177
00178
00179
00180 discount_mass = 1.0 - total_prob;
00181
00182 pc_message(verbosity,2,"Unigrams's discount mass is %g (n1/N = %g)\n",
00183 discount_mass,ng->freq_of_freq[0][1]/floatN);
00184
00185 if (discount_mass < 1e-10 && discount_mass != 0.0) {
00186 discount_mass = 0.0;
00187 pc_message(verbosity,2,"Discount mass was rounded to zero.\n");
00188 }
00189
00190
00191
00192
00193 leftover_mass = discount_mass;
00194 n_zerotons = ng->freq_of_freq[0][0] - ng->no_of_ccs;
00195
00196 if ((n_zerotons > 0) && (discount_mass > 0.0)) {
00197 total_zeroton_mass = discount_mass;
00198 if (ng->vocab_type == OPEN_VOCAB_2) {
00199 total_zeroton_mass = (1.0 - ng->oov_fraction)*discount_mass;
00200 }
00201 prob_zeroton = total_zeroton_mass / n_zerotons;
00202 prob_singleton = 1 / floatN;
00203 switch (ng->discounting_method) {
00204 case GOOD_TURING:
00205 if (ng->disc_range[0] >= 1) {
00206 prob_singleton *= ng->gt_disc_ratio[0][1];
00207 }
00208 break;
00209 case LINEAR:
00210 prob_singleton *= ng->lin_disc_ratio[0];
00211 break;
00212 case ABSOLUTE:
00213 prob_singleton *= (1-ng->abs_disc_const[0]);
00214 break;
00215 case WITTEN_BELL:
00216 prob_singleton *= floatN/(floatN + num_of_types);
00217 break;
00218 }
00219 pc_message(verbosity,2,"%d zerotons, P(zeroton) = %g P(singleton) = %g\n",
00220 n_zerotons,prob_zeroton,prob_singleton);
00221 if (prob_zeroton > ng->zeroton_fraction*prob_singleton) {
00222 prob_zeroton = ng->zeroton_fraction*prob_singleton;
00223 pc_message(verbosity,1,"P(zeroton) was reduced to %.10f (%.3f of P(singleton))\n",prob_zeroton,ng->zeroton_fraction);
00224 }
00225
00226 for (i=ng->first_id;i<=ng->vocab_size;i++) {
00227 if ((return_count(ng->four_byte_counts,
00228 ng->count_table[0],
00229 ng->count[0],
00230 ng->count4[0],
00231 i) == 0) && (!ng->context_cue[i])) {
00232 ng->uni_probs[i] = prob_zeroton;
00233 }
00234 }
00235
00236 total_zeroton_mass = n_zerotons * prob_zeroton;
00237 leftover_mass = discount_mass - total_zeroton_mass;
00238 }
00239
00240
00241
00242 if (ng->vocab_type == OPEN_VOCAB_2) {
00243 ng->uni_probs[0] += leftover_mass;
00244 if (ng->uni_probs[0] <= 0.0) {
00245 ng->uni_probs[0] = 1e-99;
00246 }
00247 }
00248 else {
00249 if (fabs(leftover_mass) > 1e-10) {
00250 for (i=ng->first_id;i<=ng->vocab_size;i++) {
00251 ng->uni_probs[i] /= (1.0 - leftover_mass);
00252 }
00253 if (fabs(leftover_mass)>1e-8) {
00254 pc_message(verbosity,1,"Unigram was renormalized to absorb a mass of %g\n",leftover_mass);
00255 }
00256 }
00257 }
00258 pc_message(verbosity,1,"prob[UNK] = %g\n",ng->uni_probs[0]);
00259 if ((n_zerotons>0) && (discount_mass<=0.0)) {
00260 pc_message(verbosity,1,"WARNING: %d non-context-cue words have zero probability\n\n",n_zerotons);
00261 }
00262 if (verbosity>=4) {
00263 fprintf(stderr,"THE FINAL UNIGRAM:\n");
00264 for (i=ng->first_id;i<=ng->vocab_size;i++) {
00265 fprintf(stderr," unigram[%d]=%g\n",i,ng->uni_probs[i]);
00266 }
00267 }
00268
00269
00270
00271 total_prob = 0.0;
00272 for (i=ng->first_id;i<=ng->vocab_size;i++) {
00273 total_prob += ng->uni_probs[i];
00274 }
00275 if (fabs(1.0-total_prob) > 1e-6) {
00276 quit(-1,"ERROR: sum[P(w)] = %.10f\n",total_prob);
00277 }
00278 if (fabs(1.0-total_prob) > 1e-9) {
00279 pc_message(verbosity,1,"WARNING: sum[P(w)] = %.10f\n\n",total_prob);
00280 }
00281
00282
00283
00284 for (i=ng->first_id;i<=ng->vocab_size;i++) {
00285 ng->uni_log_probs[i] = log(ng->uni_probs[i]);
00286 }
00287
00288 }