Main Page   Compound List   File List   Compound Members   File Members  

idngram2stats.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00020 #include <stdlib.h>
00021 #include <stdio.h>
00022 #include "pc_libs/pc_general.h"
00023 #include "toolkit.h"
00024 #include "rr_libs/general.h"
00025 
/* Identifier type stored in an n-gram's id array (see ngram.id_array). */
00026 typedef unsigned short id__t;
00027 
/* One n-gram record: a sequence of ids together with the count attached
   to that sequence. */
00028 typedef struct {
00029   unsigned short n;          /* n-gram order; main() sets it from the -n option */
00030   id__t          *id_array;  /* the ids; main() allocates n entries for it */
00031   int            count;      /* count read for this n-gram */
00032 } ngram;
00033 
00034 
/* Fetches the next n-gram from id_ngram_fp into *ng (defined outside this
   file).  main() treats a non-zero return as "a record was read" and
   passes the -ascii_input flag as `ascii` -- presumably this selects text
   rather than binary input; confirm against the definition. */
00038 int get_ngram(FILE *id_ngram_fp, ngram *ng, flag ascii);
00039 
00040 void main (int argc, char **argv) {
00041 
00042   flag first_ngram;
00043   int n;
00044   int fof_size;
00045   flag is_ascii;
00046   int verbosity;
00047   int **fof_array;
00048   int *num_kgrams;
00049   ngram current_ngram;
00050   ngram previous_ngram;
00051   int *ng_count;
00052   int pos_of_novelty;
00053   int nlines;
00054   int i;
00055   int j;
00056   int t;
00057 
00058   pos_of_novelty = n; /* Simply for warning-free compilation */
00059 
00060   report_version(&argc,argv);
00061 
00062   if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
00063     fprintf(stderr,"indngram2stats : Report statistics for an id n-gram file.\n");
00064     fprintf(stderr,"Usage : idngram2stats [ -n 3 ] \n");
00065     fprintf(stderr,"                      [ -fof_size 50 ]\n");
00066     fprintf(stderr,"                      [ -verbosity %d ]\n",
00067             DEFAULT_VERBOSITY);
00068     fprintf(stderr,"                      [ ascii_input ] \n");
00069     fprintf(stderr,"                      < .idngram > .stats\n");
00070     exit(1);
00071   }
00072 
00073   is_ascii = pc_flagarg(&argc, argv,"-ascii_input");
00074   n = pc_intarg(&argc, argv,"-n",3);
00075   fof_size = pc_intarg(&argc, argv,"-fof_size",50);
00076   verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
00077 
00078   pc_report_unk_args(&argc,argv,verbosity);
00079 
00080   pc_message(verbosity,2,"n        = %d\n",n);
00081   pc_message(verbosity,2,"fof_size = %d\n",fof_size);
00082 
00083   current_ngram.n = n;
00084   previous_ngram.n = n;
00085   
00086   pos_of_novelty = n; /* Simply for warning-free compilation */
00087 
00088   fof_array = (int **) rr_malloc(sizeof(int *)*(n-1));
00089   for (i=0;i<=n-2;i++) {
00090     fof_array[i] = (int *) rr_calloc(fof_size+1,sizeof(int));
00091   }
00092 
00093   num_kgrams = (int *) rr_calloc(n-1,sizeof(int));
00094   ng_count = (int *) rr_calloc(n-1,sizeof(int));
00095 
00096   current_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));
00097   previous_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));
00098 
00099   pc_message(verbosity,2,"Processing id n-gram file.\n");
00100   pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
00101 
00102   nlines = 0;
00103   first_ngram = 1;
00104   
00105   while (!rr_feof(stdin)) {
00106     for (i=0;i<=n-1;i++) {
00107       previous_ngram.id_array[i]=current_ngram.id_array[i];
00108     }
00109 
00110     if (get_ngram(stdin,&current_ngram,is_ascii)) {
00111 
00112       nlines++;
00113       if (nlines % 20000 == 0) {
00114         if (nlines % 1000000 == 0) {
00115           pc_message(verbosity,2,".\n");
00116         }
00117         else {
00118           pc_message(verbosity,2,".");
00119         }
00120       }
00121     
00122       /* Test for where this ngram differs from last - do we have an
00123          out-of-order ngram? */
00124       
00125       pos_of_novelty = n;
00126 
00127       for (i=0;i<=n-1;i++) {
00128         if (current_ngram.id_array[i] > previous_ngram.id_array[i]) {
00129           pos_of_novelty = i;
00130           i=n;
00131         }
00132         else {
00133           if (current_ngram.id_array[i] < previous_ngram.id_array[i]) {
00134             quit(-1,"Error : n-grams are not correctly ordered.\n");
00135           }
00136         }
00137       }
00138       
00139       if (pos_of_novelty == n && nlines != 1) {
00140         quit(-1,"Error : Repeated ngram in idngram stream.\n");
00141       }
00142 
00143       /* Add new N-gram */
00144      
00145       num_kgrams[n-2]++;
00146       if (current_ngram.count <= fof_size) {
00147         fof_array[n-2][current_ngram.count]++;
00148       }
00149 
00150       if (!first_ngram) {
00151         for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00152           num_kgrams[i-1]++;
00153           if (ng_count[i-1] <= fof_size) {
00154             fof_array[i-1][ng_count[i-1]]++;
00155           }
00156           ng_count[i-1] = current_ngram.count;
00157         }
00158       }
00159       else {
00160         for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00161           ng_count[i-1] = current_ngram.count;
00162         }
00163         first_ngram = 0;
00164       }
00165         
00166       for (i=0;i<=pos_of_novelty-2;i++) {
00167         ng_count[i] += current_ngram.count;
00168       }
00169     }
00170   }
00171 
00172   /* Process last ngram */
00173 
00174   for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
00175     num_kgrams[i-1]++;
00176     if (ng_count[i-1] <= fof_size) {
00177       fof_array[i-1][ng_count[i-1]]++;
00178     }
00179     ng_count[i-1] = current_ngram.count;
00180   }
00181   
00182   for (i=0;i<=pos_of_novelty-2;i++) {
00183     ng_count[i] += current_ngram.count;
00184   }
00185   for (i=0;i<=n-2;i++) {
00186     fprintf(stderr,"\n%d-grams occurring:\tN times\t\t> N times\tSug. -spec_num value\n",i+2);
00187     fprintf(stderr,"%7d\t\t\t\t\t\t%7d\t\t%7d\n",0,num_kgrams[i],((int)(num_kgrams[i]*1.01))+10);
00188     t = num_kgrams[i];
00189     for (j=1;j<=fof_size;j++) {
00190       t -= fof_array[i][j];
00191       fprintf(stderr,"%7d\t\t\t\t%7d\t\t%7d\t\t%7d\n",j,
00192               fof_array[i][j],t,((int)(t*1.01))+10);
00193     }
00194   }
00195 
00196   pc_message(verbosity,0,"idngram2stats : Done.\n");
00197 
00198   exit(0);
00199   
00200 }

Generated on Tue Dec 21 13:54:45 2004 by doxygen1.2.18