00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "rr_libs/general.h"
00024 #include "ngram.h"
00025 #include "pc_libs/pc_general.h"
00026
00031 void compute_gt_discount(int n,
00032 int *freq_of_freq,
00033 int fof_size,
00034 unsigned short *disc_range,
00035 int cutoff,
00036 int verbosity,
00037 disc_val_t **discounted_values) {
00038
00039
00040
00041
00042 flag done;
00043 int r;
00044 int K;
00045 double common_term;
00046 double first_term;
00047 double *D;
00048
00049 D = (double *) rr_calloc((*disc_range)+1,sizeof(double));
00050 *discounted_values = D;
00051
00052
00053
00054 if (fof_size == 0) {
00055 return;
00056 }
00057
00058 if (freq_of_freq[1] == 0) {
00059 pc_message(verbosity,2,"Warning : %d-gram : f-of-f[1] = 0 --> %d-gram discounting is disabled.\n",n,n);
00060 *disc_range=0;
00061 return;
00062 }
00063
00064 if (*disc_range + 1 > fof_size) {
00065 pc_message(verbosity,2,"Warning : %d-gram : max. recorded f-o-f is only %d\n",n,fof_size);
00066 pc_message(verbosity,2,"%d-gram discounting range is reset to %d.\n",fof_size,n,fof_size-1);
00067 *disc_range = fof_size-1;
00068 }
00069
00070 done = 0;
00071
00072 while (!done) {
00073 if (*disc_range == 0) {
00074 pc_message(verbosity,2,"Warning : %d-gram : Discounting is disabled.\n",n);
00075 return;
00076 }
00077
00078 if (*disc_range == 1) {
00079
00080 if ((n==1) && freq_of_freq[0]>0) {
00081 D[1] = freq_of_freq[1] / ((float) (freq_of_freq[1] + freq_of_freq[0]));
00082 pc_message(verbosity,2,"Warning : %d-gram : Discounting range is 1; setting P(zeroton)=P(singleton).\nDiscounted value : %.2f\n",n,D[1]);
00083 return;
00084 }
00085 else {
00086 pc_message(verbosity,2,"Warning : %d-gram : Discounting range of 1 is equivalent to excluding \nsingletons.\n",n);
00087
00088 }
00089 }
00090
00091 K = *disc_range;
00092 common_term = ((double) (K+1) * freq_of_freq[K+1]) / freq_of_freq[1];
00093 if (common_term<=0.0 || common_term>=1.0) {
00094 pc_message(verbosity,2,"Warning : %d-gram : GT statistics are out of range; lowering cutoff to %d.\n",n,K-1);
00095 (*disc_range)--;
00096 }
00097 else {
00098 for (r=1;r<=K;r++) {
00099 first_term = ((double) ((r+1) * freq_of_freq[r+1]))
00100 / (r * freq_of_freq[r]);
00101 D[r]=(first_term - common_term)/(1.0 - common_term);
00102 }
00103 pc_message(verbosity,3,"%d-gram : cutoff = %d, discounted values:",n,K);
00104 for (r=1;r<=K;r++) {
00105 pc_message(verbosity,3," %.2f",D[r]);
00106 }
00107 pc_message(verbosity,3,"\n");
00108 done = 1;
00109 for (r=1; r<=K; r++) {
00110 if (D[r]<0 || D[r]>1.0) {
00111 pc_message(verbosity,2,"Warning : %d-gram : Some discount values are out of range;\nlowering discounting range to %d.\n",n,K-1);
00112 (*disc_range)--;
00113 r=K+1;
00114 done = 0;
00115 }
00116 }
00117 }
00118 }
00119
00120 for (r=1; r<=MIN(cutoff,K); r++) D[r] = 0.0;
00121
00122 }
00123