00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #define DEFAULT_HASH_SIZE 200000
00021 #define DEFAULT_MAX_FILES 20
00022 #define MAX_N 20
00023 #define TEMP_FILE_ROOT "text2idngram.temp."
00024
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 #include <sys/types.h>
00029 #include <sys/utsname.h>
00030 #include <unistd.h>
00031 #include "toolkit.h"
00032 #include "rr_libs/general.h"
00033 #include "pc_libs/pc_general.h"
00034 #include "idngram.h"
00035
00041 void add_to_buffer(unsigned short word_index,
00042 int ypos,
00043 int xpos,
00044 unsigned short *buffer) {
00045
00046
00047 buffer[(n*ypos)+xpos] = word_index;
00048
00049 }
00050
00051 unsigned short buffer_contents(int ypos,
00052 int xpos,
00053 unsigned short *buffer) {
00054
00055 return (buffer[(n*ypos)+xpos]);
00056
00057 }
00058
00059
00060
00061
00062
00063
00064
00065 void main(int argc, char *argv[]) {
00066
00067 int i,j;
00068
00069 char *vocab_filename;
00070 FILE *tempfile;
00071 char tempfiles_directory[1000];
00072 int vocab_size;
00073 FILE *vocab_file;
00074
00075 int verbosity;
00076
00077 int buffer_size;
00078 int position_in_buffer;
00079 int number_of_tempfiles;
00080 int max_files;
00081 int fof_size;
00082
00083 unsigned short *buffer;
00084 unsigned short *placeholder;
00085 unsigned short *temp_ngram;
00086 int temp_count;
00087
00088 char temp_word[MAX_WORD_LENGTH];
00089 char temp_word2[MAX_WORD_LENGTH];
00090
00091 char *temp_file_root;
00092 char *temp_file_ext;
00093 char *host_name;
00094 int proc_id;
00095 struct utsname uname_info;
00096
00097 flag write_ascii;
00098
00099
00100
00101 struct hash_table vocabulary;
00102 unsigned long hash_size;
00103 unsigned long M;
00104
00105 tempfile = NULL;
00106
00107 report_version(&argc,argv);
00108
00109 verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
00110
00111
00112
00113 if (pc_flagarg( &argc, argv,"-help") || argc==1) {
00114 fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n");
00115 fprintf(stderr,"Usage : text2idngram -vocab .vocab \n");
00116 fprintf(stderr," [ -buffer 100 ]\n");
00117 fprintf(stderr," [ -hash %d ]\n",DEFAULT_HASH_SIZE);
00118 fprintf(stderr," [ -temp %s ]\n",DEFAULT_TEMP);
00119 fprintf(stderr," [ -files %d ]\n",DEFAULT_MAX_FILES);
00120 fprintf(stderr," [ -gzip | -compress ]\n");
00121 fprintf(stderr," [ -verbosity %d ]\n",
00122 DEFAULT_VERBOSITY);
00123 fprintf(stderr," [ -n 3 ]\n");
00124 fprintf(stderr," [ -write_ascii ]\n");
00125 fprintf(stderr," [ -fof_size 10 ]\n");
00126 exit(1);
00127 }
00128
00129 pc_message(verbosity,2,"text2idngram\n");
00130
00131 n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
00132
00133 placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
00134 temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
00135 hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
00136 buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
00137
00138 write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
00139
00140 fof_size = pc_intarg(&argc,argv,"-fof_size",10);
00141
00142 max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
00143
00144 vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
00145
00146 if (!strcmp("",vocab_filename)) {
00147 quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n");
00148 }
00149
00150 strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp",
00151 DEFAULT_TEMP));
00152
00153 if (pc_flagarg(&argc,argv,"-compress")) {
00154 temp_file_ext = salloc(".Z");
00155 }
00156 else {
00157 if (pc_flagarg(&argc,argv,"-gzip")) {
00158 temp_file_ext = salloc(".gz");
00159 }
00160 else {
00161 temp_file_ext = salloc("");
00162 }
00163 }
00164
00165 uname(&uname_info);
00166
00167 host_name = salloc(uname_info.nodename);
00168
00169 proc_id = getpid();
00170
00171 sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);
00172
00173 temp_file_root = salloc(temp_word);
00174
00175 pc_report_unk_args(&argc,argv,verbosity);
00176
00177
00178
00179 if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
00180 strcat(tempfiles_directory,"/");
00181 }
00182
00183 pc_message(verbosity,2,"Vocab : %s\n",vocab_filename);
00184 pc_message(verbosity,2,"N-gram buffer size : %d\n",buffer_size);
00185 pc_message(verbosity,2,"Hash table size : %d\n",hash_size);
00186 pc_message(verbosity,2,"Temp directory : %s\n",tempfiles_directory);
00187 pc_message(verbosity,2,"Max open files : %d\n",max_files);
00188 pc_message(verbosity,2,"FOF size : %d\n",fof_size);
00189 pc_message(verbosity,2,"n : %d\n",n);
00190
00191 buffer_size *= (1000000/(sizeof(unsigned short)*n));
00192
00193
00194
00195 fprintf(stderr,"Initialising hash table...\n");
00196
00197 M = nearest_prime(hash_size);
00198
00199 new_hashtable(&vocabulary,M);
00200
00201
00202
00203 vocab_size = 0;
00204
00205 vocab_file = rr_iopen(vocab_filename);
00206
00207 pc_message(verbosity,2,"Reading vocabulary...\n");
00208
00209 while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
00210 if (strncmp(temp_word,"##",2)==0) continue;
00211 sscanf (temp_word, "%s ",temp_word2);
00212
00213
00214
00215 if (index2(&vocabulary,temp_word2) != 0) {
00216 fprintf(stderr,"======================================================\n");
00217 fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word);
00218 fprintf(stderr,"=======================================================\n");
00219 }
00220 if (strncmp(temp_word,"#",1)==0) {
00221 fprintf(stderr,"\n\n===========================================================\n");
00222 fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
00223 fprintf(stderr, ">>> %s <<<\n",temp_word);
00224 fprintf(stderr, " '%s' will be included in the vocabulary.\n",temp_word2);
00225 fprintf(stderr, " (comments must start with '##')\n");
00226 fprintf(stderr,"===========================================================\n\n");
00227 }
00228 vocab_size++;
00229 add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
00230 }
00231
00232 if (vocab_size > MAX_VOCAB_SIZE) {
00233 quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n");
00234 }
00235
00236 pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n");
00237
00238 buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short));
00239
00240 number_of_tempfiles = 0;
00241
00242
00243
00244
00245
00246 position_in_buffer = 0;
00247
00248 for (i=0;i<=n-1;i++) {
00249 get_word(stdin,temp_word);
00250 add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer);
00251 }
00252
00253 while (!rr_feof(stdin)) {
00254
00255
00256
00257 pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
00258 pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
00259 while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) {
00260 position_in_buffer++;
00261 if (position_in_buffer % 20000 == 0) {
00262 if (position_in_buffer % 1000000 == 0) {
00263 pc_message(verbosity,2,".\n");
00264 }
00265 else {
00266 pc_message(verbosity,2,".");
00267 }
00268 }
00269 for (i=1;i<=n-1;i++) {
00270 add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
00271 position_in_buffer,i-1,buffer);
00272 }
00273 if (get_word(stdin,temp_word) == 1) {
00274 add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer,
00275 n-1,buffer);
00276 }
00277 }
00278
00279 for (i=0;i<=n-1;i++) {
00280 placeholder[i] = buffer_contents(position_in_buffer,i,buffer);
00281 }
00282
00283
00284
00285 pc_message(verbosity,2,"\nSorting n-grams...\n");
00286
00287 qsort((void*) buffer,(size_t) position_in_buffer,
00288 n*sizeof(unsigned short),compare_ngrams);
00289
00290
00291
00292 number_of_tempfiles++;
00293
00294 sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00295 number_of_tempfiles,temp_file_ext);
00296
00297 pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
00298 temp_word);
00299
00300 tempfile = rr_oopen(temp_word);
00301
00302 for (i=0;i<=n-1;i++) {
00303 temp_ngram[i] = buffer_contents(0,i,buffer);
00304 if (temp_ngram[i] > MAX_VOCAB_SIZE) {
00305 quit(-1,"Invalid trigram in buffer.\nAborting");
00306
00307 }
00308 }
00309 temp_count = 1;
00310
00311 for (i=1;i<=position_in_buffer;i++) {
00312
00313 if (!compare_ngrams(temp_ngram,&buffer[i*n])) {
00314 temp_count++;
00315 }
00316 else {
00317 for (j=0;j<=n-1;j++) {
00318 rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1,
00319 tempfile,"temporary n-gram ids");
00320 temp_ngram[j] = buffer_contents(i,j,buffer);
00321 }
00322 rr_fwrite(&temp_count,sizeof(int),1,tempfile,
00323 "temporary n-gram counts");
00324 temp_count = 1;
00325 }
00326 }
00327
00328 rr_oclose(tempfile);
00329
00330 for (i=0;i<=n-1;i++) {
00331 add_to_buffer(placeholder[i],0,i,buffer);
00332 }
00333
00334 position_in_buffer = 0;
00335
00336 }
00337
00338
00339
00340 pc_message(verbosity,2,"Merging temporary files...\n");
00341
00342 merge_tempfiles(1,
00343 number_of_tempfiles,
00344 temp_file_root,
00345 temp_file_ext,
00346 max_files,
00347 tempfiles_directory,
00348 stdout,
00349 write_ascii,
00350 fof_size);
00351
00352 pc_message(verbosity,0,"text2idngram : Done.\n");
00353
00354 exit(0);
00355
00356 }
00357