00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include <stdlib.h>
00021 #include <stdio.h>
00022 #include "pc_libs/pc_general.h"
00023 #include "toolkit.h"
00024 #include "rr_libs/general.h"
00025
/* Type used for a single word id inside an id n-gram. */
typedef unsigned short id__t;

/* One record of an id n-gram stream. */
typedef struct {
  unsigned short n;   /* Number of ids in id_array (the n-gram order); main
                         sets this before reading any records. */
  id__t *id_array;    /* The ids making up the n-gram; the buffer is allocated
                         by whoever creates the record (main allocates n
                         elements). */
  int count;          /* Count attached to this n-gram in the stream. */
} ngram;
00033
00034
00038 int get_ngram(FILE *id_ngram_fp, ngram *ng, flag ascii);
00039
00040 void main (int argc, char **argv) {
00041
00042 flag first_ngram;
00043 int n;
00044 int fof_size;
00045 flag is_ascii;
00046 int verbosity;
00047 int **fof_array;
00048 int *num_kgrams;
00049 ngram current_ngram;
00050 ngram previous_ngram;
00051 int *ng_count;
00052 int pos_of_novelty;
00053 int nlines;
00054 int i;
00055 int j;
00056 int t;
00057
00058 pos_of_novelty = n;
00059
00060 report_version(&argc,argv);
00061
00062 if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
00063 fprintf(stderr,"indngram2stats : Report statistics for an id n-gram file.\n");
00064 fprintf(stderr,"Usage : idngram2stats [ -n 3 ] \n");
00065 fprintf(stderr," [ -fof_size 50 ]\n");
00066 fprintf(stderr," [ -verbosity %d ]\n",
00067 DEFAULT_VERBOSITY);
00068 fprintf(stderr," [ ascii_input ] \n");
00069 fprintf(stderr," < .idngram > .stats\n");
00070 exit(1);
00071 }
00072
00073 is_ascii = pc_flagarg(&argc, argv,"-ascii_input");
00074 n = pc_intarg(&argc, argv,"-n",3);
00075 fof_size = pc_intarg(&argc, argv,"-fof_size",50);
00076 verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
00077
00078 pc_report_unk_args(&argc,argv,verbosity);
00079
00080 pc_message(verbosity,2,"n = %d\n",n);
00081 pc_message(verbosity,2,"fof_size = %d\n",fof_size);
00082
00083 current_ngram.n = n;
00084 previous_ngram.n = n;
00085
00086 pos_of_novelty = n;
00087
00088 fof_array = (int **) rr_malloc(sizeof(int *)*(n-1));
00089 for (i=0;i<=n-2;i++) {
00090 fof_array[i] = (int *) rr_calloc(fof_size+1,sizeof(int));
00091 }
00092
00093 num_kgrams = (int *) rr_calloc(n-1,sizeof(int));
00094 ng_count = (int *) rr_calloc(n-1,sizeof(int));
00095
00096 current_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));
00097 previous_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));
00098
00099 pc_message(verbosity,2,"Processing id n-gram file.\n");
00100 pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
00101
00102 nlines = 0;
00103 first_ngram = 1;
00104
00105 while (!rr_feof(stdin)) {
00106 for (i=0;i<=n-1;i++) {
00107 previous_ngram.id_array[i]=current_ngram.id_array[i];
00108 }
00109
00110 if (get_ngram(stdin,¤t_ngram,is_ascii)) {
00111
00112 nlines++;
00113 if (nlines % 20000 == 0) {
00114 if (nlines % 1000000 == 0) {
00115 pc_message(verbosity,2,".\n");
00116 }
00117 else {
00118 pc_message(verbosity,2,".");
00119 }
00120 }
00121
00122
00123
00124
00125 pos_of_novelty = n;
00126
00127 for (i=0;i<=n-1;i++) {
00128 if (current_ngram.id_array[i] > previous_ngram.id_array[i]) {
00129 pos_of_novelty = i;
00130 i=n;
00131 }
00132 else {
00133 if (current_ngram.id_array[i] < previous_ngram.id_array[i]) {
00134 quit(-1,"Error : n-grams are not correctly ordered.\n");
00135 }
00136 }
00137 }
00138
00139 if (pos_of_novelty == n && nlines != 1) {
00140 quit(-1,"Error : Repeated ngram in idngram stream.\n");
00141 }
00142
00143
00144
00145 num_kgrams[n-2]++;
00146 if (current_ngram.count <= fof_size) {
00147 fof_array[n-2][current_ngram.count]++;
00148 }
00149
00150 if (!first_ngram) {
00151 for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00152 num_kgrams[i-1]++;
00153 if (ng_count[i-1] <= fof_size) {
00154 fof_array[i-1][ng_count[i-1]]++;
00155 }
00156 ng_count[i-1] = current_ngram.count;
00157 }
00158 }
00159 else {
00160 for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00161 ng_count[i-1] = current_ngram.count;
00162 }
00163 first_ngram = 0;
00164 }
00165
00166 for (i=0;i<=pos_of_novelty-2;i++) {
00167 ng_count[i] += current_ngram.count;
00168 }
00169 }
00170 }
00171
00172
00173
00174 for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00175 num_kgrams[i-1]++;
00176 if (ng_count[i-1] <= fof_size) {
00177 fof_array[i-1][ng_count[i-1]]++;
00178 }
00179 ng_count[i-1] = current_ngram.count;
00180 }
00181
00182 for (i=0;i<=pos_of_novelty-2;i++) {
00183 ng_count[i] += current_ngram.count;
00184 }
00185 for (i=0;i<=n-2;i++) {
00186 fprintf(stderr,"\n%d-grams occurring:\tN times\t\t> N times\tSug. -spec_num value\n",i+2);
00187 fprintf(stderr,"%7d\t\t\t\t\t\t%7d\t\t%7d\n",0,num_kgrams[i],((int)(num_kgrams[i]*1.01))+10);
00188 t = num_kgrams[i];
00189 for (j=1;j<=fof_size;j++) {
00190 t -= fof_array[i][j];
00191 fprintf(stderr,"%7d\t\t\t\t%7d\t\t%7d\t\t%7d\n",j,
00192 fof_array[i][j],t,((int)(t*1.01))+10);
00193 }
00194 }
00195
00196 pc_message(verbosity,0,"idngram2stats : Done.\n");
00197
00198 exit(0);
00199
00200 }