00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "pc_libs/pc_general.h"
#include "toolkit.h"
#include "rr_libs/general.h"
00025
00026 typedef unsigned short id__t;
00027
00028
/*
 * One id n-gram record together with its occurrence count.
 * In this file id_array is always allocated with n elements (see main),
 * and cmp_ngram compares the first n entries of two records.
 */
00035 typedef struct {
00036 unsigned short n; /* number of ids held in id_array */
00037 id__t *id_array; /* the ids making up the n-gram, in order */
00038 int count; /* occurrence count; main sums it across input files */
00039 } ngram;
00040
00041 int n;
00042 flag ascii_in;
00043 flag ascii_out;
00044
00045 void updateArgs( int *pargc, char **argv, int rm_cnt );
00046 void procComLine( int *argc, char **argv );
00047 void printUsage( char *name );
00048 int cmp_ngram( ngram *ng1, ngram *ng2 );
00049 extern int get_ngram(FILE *id_ngram_fp, ngram *ng, flag ascii);
00050
00051
00052 void write_ngram( FILE *id_ngram_fp, ngram *ng, flag ascii )
00053 {
00054 int i;
00055
00056 if ( ascii ) {
00057 for( i = 0; i < n; i++ ) {
00058 if ( fprintf( stdout, "%hu ", ng->id_array[i] ) < 0 ) {
00059 quit( -1, "error writing ascii ngram\n" );
00060 }
00061 }
00062 if ( fprintf( stdout, "%d\n", ng->count ) < 0 ) {
00063 quit( -1, "error writing ascii ngram\n" );
00064 }
00065 }
00066 else {
00067
00068 for ( i = 0; i < n; i++ ) {
00069 rr_fwrite( &ng->id_array[i], sizeof( id__t ), 1, id_ngram_fp,
00070 "binary ngram" );
00071 }
00072 rr_fwrite( &ng->count, sizeof( int ), 1, id_ngram_fp,
00073 "binary ngram" );
00074 }
00075 }
00076
00077
/*
 * Drop the argument at position rm_cnt from an argv-style array.
 *
 *   pargc   in/out argument count; decremented by one
 *   argv    argument array; entries after rm_cnt move down one slot
 *   rm_cnt  index of the entry to remove
 *
 * The slot just past the new count is left untouched, so it may still
 * hold a stale pointer.
 */
void updateArgs( int *pargc, char **argv, int rm_cnt )
{
    int slot = rm_cnt;

    *pargc -= 1;

    /* Close the gap by pulling each later entry one position forward. */
    while ( slot < *pargc ) {
        argv[slot] = argv[slot + 1];
        slot++;
    }
}
00088
00089
00090 void procComLine( int *argc, char **argv )
00091 {
00092 int i;
00093
00094 n = 3;
00095 ascii_in = 0;
00096 ascii_out = 0;
00097
00098 i = *argc - 1 ;
00099 while( i > 0 ) {
00100
00101
00102 if( !strcmp( argv[i], "-h" ) || !strcmp( argv[i], "-help" ) ) {
00103 printUsage( argv[0] ) ;
00104 exit( 1 ) ;
00105 }
00106
00107
00108 if( !strcmp( argv[i], "-n" ) ) {
00109 n = atoi( argv[i+1] ) ;
00110 updateArgs( argc, argv, i+1 ) ;
00111 updateArgs( argc, argv, i ) ;
00112 }
00113
00114
00115 if( !strcmp( argv[i], "-ascii_input" ) ) {
00116 ascii_in = 1;
00117 updateArgs( argc, argv, i ) ;
00118 }
00119
00120
00121 if( !strcmp( argv[i], "-ascii_output" ) ) {
00122 ascii_out = 1;
00123 updateArgs( argc, argv, i ) ;
00124 }
00125
00126 i--;
00127 }
00128 }
00129
00130
/*
 * Print the command-line help text to stderr and terminate the program
 * with exit status 1.
 *
 *   name  program name shown in the banner and the usage line
 */
void printUsage( char *name )
{
    FILE *out = stderr;

    /* Banner and synopsis need the program name substituted in. */
    fprintf( out, "%s: merge idngram files.\n", name );
    fprintf( out, "Usage:\n%s [options] .idngram_1 ... .idngram_N > .idngram\n", name );

    /* Fixed option descriptions. */
    fputs( " -n 3 \tn in n-gram \n", out );
    fputs( " -ascii_input \tinput files are ascii\n", out );
    fputs( " -ascii_output \toutput files are ascii\n", out );

    exit( 1 );
}
00140
00141
00142 int cmp_ngram( ngram *ng1, ngram *ng2 )
00143 {
00144 int i;
00145
00146 if ( ng1->n != ng2->n ) {
00147 quit( -1, "Error: n-grams have different n!\n" );
00148 }
00149
00150 for( i = 0; i < ng1->n; i++ ) {
00151 if ( ng1->id_array[i] < ng2->id_array[i] ) return( -1 );
00152 if ( ng1->id_array[i] > ng2->id_array[i] ) return( 1 );
00153 }
00154 return( 0 );
00155 }
00156
00157 int main( int argc, char **argv )
00158 {
00159 FILE **fin;
00160 ngram *ng;
00161 ngram outng;
00162 flag *done, finished;
00163 int i, j, nfiles;
00164
00165
00166 report_version(&argc,argv);
00167 procComLine( &argc, argv ) ;
00168 if( argc < 2 ) {
00169 printUsage( argv[0] ) ;
00170 exit( 1 ) ;
00171 }
00172 nfiles = argc - 1;
00173
00174
00175 fin = (FILE **) rr_malloc( sizeof( FILE *) * nfiles );
00176 done = (flag *) rr_malloc( sizeof( flag ) * nfiles );
00177 ng = (ngram *) rr_malloc( sizeof( ngram ) * nfiles );
00178 for( i = 0; i < nfiles; i++ ) {
00179 ng[i].id_array = (id__t *) rr_calloc( n, sizeof( id__t ) );
00180 ng[i].n = n;
00181 }
00182 outng.id_array = (id__t *) rr_calloc( n, sizeof( id__t ) );
00183 outng.n = n;
00184
00185
00186 for( i = 0; i < nfiles; i++ ) {
00187 fin[i] = rr_iopen( argv[i+1] );
00188 }
00189
00190
00191 for( i = 0; i < nfiles; i++ ) {
00192 done[i] = 0;
00193 if ( !get_ngram( fin[i], &ng[i], ascii_in ) ) {
00194 done[i] = 1;
00195 }
00196 }
00197
00198 finished = 0;
00199 while ( !finished ) {
00200
00201
00202 for( i = 0; i < n; i++ )
00203 outng.id_array[i] = MAX_VOCAB_SIZE;
00204
00205
00206 for( i = 0; i < nfiles; i++ ) {
00207 if ( !done[i] ) {
00208 if ( cmp_ngram( &outng, &ng[i] ) > 0 ) {
00209 for( j = 0; j < n; j++ ) outng.id_array[j] = ng[i].id_array[j];
00210 }
00211 }
00212 }
00213
00214 outng.count = 0;
00215 for( i = 0; i < nfiles; i++ ) {
00216 if ( !done[i] ) {
00217
00218 if ( cmp_ngram( &outng, &ng[i] ) == 0 ) {
00219 outng.count += ng[i].count;
00220 if ( !get_ngram( fin[i], &ng[i], ascii_in ) ) {
00221
00222 done[i] = 1;
00223 finished = 1;
00224 for( j = 0; j < nfiles; j++ ) {
00225 if ( ! done[j] ) finished = 0;
00226 }
00227 }
00228 }
00229 }
00230 }
00231
00232 write_ngram( stdout, &outng, ascii_out );
00233
00234 }
00235 for( i = 0; i < nfiles; i++ )
00236 rr_iclose( fin[i] );
00237
00238 fprintf(stderr,"mergeidngram : Done.\n");
00239
00240 return( 0 );
00241 }
00242