Main Page   Compound List   File List   Compound Members   File Members  

mergeidngram.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "pc_libs/pc_general.h"
#include "toolkit.h"
#include "rr_libs/general.h"
00025 
00026 typedef unsigned short id__t;   /* numeric id of one n-gram element */
00027 
00028 /* One id n-gram record: n ids plus the count attached to them. */
00035 typedef struct {
00036   unsigned short n;          /* number of entries in id_array (main sets it to the global n) */
00037   id__t          *id_array;  /* the ids themselves; main allocates n entries per record */
00038   int            count;      /* count for this n-gram; main sums it across files when merging */
00039 } ngram;
00040 
00041 int n;            /* n-gram order; procComLine defaults it to 3, -n overrides it */
00042 flag ascii_in;    /* set to 1 by procComLine when -ascii_input is given */
00043 flag ascii_out;   /* set to 1 by procComLine when -ascii_output is given */
00044 
00045 void updateArgs( int *pargc, char **argv, int rm_cnt );   /* drop argv[rm_cnt] and decrement *pargc */
00046 void procComLine( int *argc, char **argv );               /* parse options and strip them from argv */
00047 void printUsage( char *name );                            /* print usage text to stderr, then exit(1) */
00048 int cmp_ngram( ngram *ng1, ngram *ng2 );                  /* returns -1, 0 or 1 comparing the id arrays in order */
00049 extern int get_ngram(FILE *id_ngram_fp, ngram *ng, flag ascii);   /* defined elsewhere; main treats a 0 return as "no more records" */
00050 
00051 /* write ngram in either ascii or binary */
00052 void write_ngram( FILE *id_ngram_fp, ngram *ng, flag ascii )
00053 {
00054   int i;
00055   
00056   if ( ascii ) {
00057     for( i = 0; i < n; i++ ) {
00058       if ( fprintf( stdout, "%hu ", ng->id_array[i] ) < 0 ) {
00059         quit( -1, "error writing ascii ngram\n" );
00060       }
00061     }
00062     if ( fprintf( stdout, "%d\n", ng->count ) < 0 ) {
00063       quit( -1, "error writing ascii ngram\n" );
00064     }
00065   }
00066   else {
00067 
00068     for ( i = 0; i < n; i++ ) {
00069       rr_fwrite( &ng->id_array[i], sizeof( id__t ), 1, id_ngram_fp,
00070                  "binary ngram" );
00071     }
00072     rr_fwrite( &ng->count, sizeof( int ), 1, id_ngram_fp,
00073                "binary ngram" );
00074   }
00075 }
00076 
/*
 * Remove one entry from the command line.
 *
 *   pargc   in/out: argument count; decremented by one
 *   argv    argument vector; entries after rm_cnt slide down one slot
 *   rm_cnt  index of the entry to remove
 *
 * When rm_cnt is at or beyond the new count nothing is moved, but the
 * count is still decremented.
 */
void updateArgs( int *pargc, char **argv, int rm_cnt )
{
  int last = *pargc - 1;   /* argument count once the entry is gone */
  int pos  = rm_cnt;

  *pargc = last;

  /* close the gap left by the removed entry */
  while ( pos < last ) {
    argv[pos] = argv[pos + 1];
    pos++;
  }
}
00088 
00089 /* process the command line */
00090 void procComLine( int *argc, char **argv ) 
00091 {
00092   int i;
00093 
00094   n = 3;
00095   ascii_in = 0;
00096   ascii_out = 0;
00097 
00098   i = *argc - 1 ;
00099   while( i > 0 ) {
00100 
00101     /* handle a request for help */
00102     if( !strcmp( argv[i], "-h" ) || !strcmp( argv[i], "-help" ) ) {
00103       printUsage( argv[0] ) ;
00104       exit( 1 ) ;
00105     }
00106 
00107     /* specify n */
00108     if( !strcmp( argv[i], "-n" ) ) {
00109       n = atoi( argv[i+1] ) ;
00110       updateArgs( argc, argv, i+1 ) ;
00111       updateArgs( argc, argv, i ) ;
00112     }
00113     
00114     /* input files in ascii */
00115     if( !strcmp( argv[i], "-ascii_input" ) ) {
00116       ascii_in = 1;
00117       updateArgs( argc, argv, i ) ;
00118     }
00119 
00120     /* input files in ascii */
00121     if( !strcmp( argv[i], "-ascii_output" ) ) {
00122       ascii_out = 1;
00123       updateArgs( argc, argv, i ) ;
00124     }
00125 
00126     i--;
00127   }
00128 }
00129    
/*
 * Show command line usage on stderr and terminate with exit status 1.
 *
 *   name  program name, as it should appear in the usage text
 */
void printUsage( char *name )
{
  /* one help line per supported option */
  static const char *const option_help[] = {
    "  -n 3           \tn in n-gram \n",
    "  -ascii_input   \tinput files are ascii\n",
    "  -ascii_output  \toutput files are ascii\n"
  };
  size_t k;

  fprintf( stderr, "%s: merge idngram files.\n", name );
  fprintf( stderr, "Usage:\n%s [options] .idngram_1 ... .idngram_N > .idngram\n", name );

  for ( k = 0; k < sizeof option_help / sizeof option_help[0]; k++ ) {
    fputs( option_help[k], stderr );
  }

  exit(1);
}
00140 
00141 /* compare two ngrams */
00142 int cmp_ngram( ngram *ng1, ngram *ng2 )
00143 {
00144   int i;
00145 
00146   if ( ng1->n != ng2->n ) {
00147     quit( -1, "Error: n-grams have different n!\n" );
00148   }
00149 
00150   for( i = 0; i < ng1->n; i++ ) {
00151     if ( ng1->id_array[i] < ng2->id_array[i] ) return( -1 );
00152     if ( ng1->id_array[i] > ng2->id_array[i] ) return( 1 );
00153   }
00154   return( 0 );
00155 }
00156     
00157 int main( int argc, char **argv )
00158 {
00159   FILE **fin;
00160   ngram *ng;
00161   ngram outng;
00162   flag *done, finished;
00163   int i, j, nfiles;
00164 
00165   /* Process the command line */
00166   report_version(&argc,argv);
00167   procComLine( &argc, argv ) ;
00168   if( argc < 2 ) {
00169     printUsage( argv[0] ) ;
00170     exit( 1 ) ;
00171   }
00172   nfiles = argc - 1;
00173 
00174   /* allocate memory */
00175   fin = (FILE **) rr_malloc( sizeof( FILE *) * nfiles );
00176   done = (flag *) rr_malloc( sizeof( flag ) * nfiles );
00177   ng = (ngram *) rr_malloc( sizeof( ngram ) * nfiles );
00178   for( i = 0; i < nfiles; i++ ) {
00179     ng[i].id_array = (id__t *) rr_calloc( n, sizeof( id__t ) );
00180     ng[i].n = n;
00181   }
00182   outng.id_array = (id__t *) rr_calloc( n, sizeof( id__t ) );
00183   outng.n = n;
00184 
00185   /* open the input files */
00186   for( i = 0; i < nfiles; i++ ) {
00187     fin[i] = rr_iopen( argv[i+1] );
00188   }
00189 
00190   /* read first ngram from each file */
00191   for( i = 0; i < nfiles; i++ ) {
00192     done[i] = 0;
00193     if ( !get_ngram( fin[i], &ng[i], ascii_in ) ) {
00194       done[i] = 1;
00195     }
00196   }
00197 
00198   finished = 0;
00199   while ( !finished ) {
00200 
00201   /* set outng to max possible */
00202   for( i = 0; i < n; i++ )
00203     outng.id_array[i] = MAX_VOCAB_SIZE;
00204     
00205     /* find smallest ngram */
00206     for( i = 0; i < nfiles; i++ ) {
00207       if ( !done[i] ) {
00208         if ( cmp_ngram( &outng, &ng[i] ) > 0 ) {
00209           for( j = 0; j < n; j++ ) outng.id_array[j] = ng[i].id_array[j];
00210         }
00211       }
00212     }
00213     
00214     outng.count = 0;
00215     for( i = 0; i < nfiles; i++ ) {
00216       if ( !done[i] ) {
00217         /* add counts of equal ngrams */
00218         if ( cmp_ngram( &outng, &ng[i] ) == 0 ) {
00219           outng.count += ng[i].count;
00220           if ( !get_ngram( fin[i], &ng[i], ascii_in ) ) {
00221             /* check if all files done */
00222             done[i] = 1;
00223             finished = 1;
00224             for( j = 0; j < nfiles; j++ ) {
00225               if ( ! done[j] ) finished = 0;
00226             }
00227           }
00228         }
00229       }
00230     }
00231 
00232     write_ngram( stdout, &outng, ascii_out );
00233 
00234   }
00235   for( i = 0; i < nfiles; i++ )
00236     rr_iclose( fin[i] );
00237 
00238   fprintf(stderr,"mergeidngram : Done.\n");
00239 
00240   return( 0 );
00241 }
00242 

Generated on Tue Dec 21 13:54:45 2004 by doxygen1.2.18