00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #define DEFAULT_MAX_FILES 20
00021 #define TEMP_FILE_ROOT "text2wngram.tmp."
00022
00023 #include <sys/types.h>
00024 #include <unistd.h>
00025 #include <sys/utsname.h>
00026 #include <stdio.h>
00027 #include <string.h>
00028 #include <stdlib.h>
00029 #include "toolkit.h"
00030 #include "pc_libs/pc_general.h"
00031 #include "rr_libs/general.h"
00032
/*
 * qsort() comparator for an array of C-string pointers.
 *
 * string1 / string2 : addresses of two array elements, i.e. each one points
 *                     at a `char *`.
 * Returns the strcmp() ordering of the two strings (<0, 0 or >0).
 *
 * The elements are read through pointer-to-const so that the const
 * qualifier handed over by qsort() is not cast away.
 */
int cmp_strings(const void *string1,const void *string2) {

  const char *left  = *(const char * const *) string1;
  const char *right = *(const char * const *) string2;

  return strcmp(left,right);

}
00047
00048 void merge_tempfiles (int start_file,
00049 int end_file,
00050 char *temp_file_root,
00051 char *temp_file_ext,
00052 int max_files,
00053 char *tempfiles_directory,
00054 FILE *outfile,
00055 int n,
00056 int verbosity) {
00057
00058
00059 FILE *new_temp_file;
00060 char *new_temp_filename;
00061
00062 FILE **temp_file;
00063 char **temp_filename;
00064 char **current_ngram;
00065 char smallest_ngram[1000];
00066 int *current_ngram_count;
00067 flag *finished;
00068 flag all_finished;
00069 int temp_count;
00070 char temp_word[500];
00071 int i,j;
00072
00073 pc_message(verbosity,2,"Merging temp files %d through %d...\n", start_file,
00074 end_file);
00075
00076
00077
00078
00079 if (end_file-start_file+1 > max_files) {
00080 int new_start_file, new_end_file;
00081 int n_file_groups = 1 + (end_file-start_file)/max_files;
00082
00083 fprintf(stderr, "%d files to do, in %d groups\n", end_file-start_file,
00084 n_file_groups);
00085
00086 new_temp_filename = (char *) rr_malloc(300*sizeof(char));
00087
00088
00089
00090
00091
00092
00093
00094
00095 for (i = 0; i < n_file_groups; i++) {
00096
00097 new_start_file = start_file + (i*max_files);
00098 new_end_file = start_file + ((i+1)*max_files) - 1;
00099 if (new_end_file > end_file) new_end_file = end_file;
00100
00101 sprintf(new_temp_filename,
00102 "%s%s%hu%s",
00103 tempfiles_directory,
00104 temp_file_root,
00105 end_file+i+1,
00106 temp_file_ext);
00107
00108 new_temp_file = rr_oopen(new_temp_filename);
00109
00110 merge_tempfiles(new_start_file,
00111 new_end_file,
00112 temp_file_root,
00113 temp_file_ext,
00114 max_files,
00115 tempfiles_directory,
00116 new_temp_file,
00117 n,
00118 verbosity);
00119
00120 rr_iclose(new_temp_file);
00121
00122 }
00123
00124 merge_tempfiles(end_file+1,
00125 end_file+n_file_groups,
00126 temp_file_root,
00127 temp_file_ext,
00128 max_files,
00129 tempfiles_directory,
00130 outfile,
00131 n,
00132 verbosity);
00133
00134 return;
00135 }
00136
00137
00138
00139
00140
00141 temp_file = (FILE **) rr_malloc((end_file+1)*sizeof(FILE *));
00142 temp_filename = (char **) rr_malloc((end_file+1)*sizeof(char *));
00143 for (i=start_file;i<=end_file;i++) {
00144 temp_filename[i] = (char *) rr_malloc(300*sizeof(char));
00145 }
00146 current_ngram = (char **) rr_malloc((end_file+1)*sizeof(char *));
00147 for (i=start_file;i<=end_file;i++) {
00148 current_ngram[i] = (char *) rr_malloc(1000*sizeof(char));
00149 }
00150 current_ngram_count = (int *) rr_malloc((end_file+1)*sizeof(int));
00151 finished = (flag *) rr_malloc(sizeof(flag)*(end_file+1));
00152
00153
00154 for (i=start_file;i<=end_file;i++) {
00155 sprintf(temp_filename[i],"%s%s%hu%s",tempfiles_directory,
00156 temp_file_root,i,temp_file_ext);
00157 temp_file[i] = rr_iopen(temp_filename[i]);
00158 }
00159
00160
00161
00162
00163 for (i=start_file;i<=end_file;i++) {
00164 finished[i] = 0;
00165 if (!rr_feof(temp_file[i])) {
00166 for (j=0;j<=n-1;j++) {
00167 if (fscanf(temp_file[i],"%s",temp_word) != 1) {
00168 if (!rr_feof(temp_file[i])) {
00169 quit(-1,"Error reading temp file %s\n",temp_filename[i]);
00170 }
00171 }
00172 else {
00173 if (j==0) {
00174 strcpy(current_ngram[i],temp_word);
00175 }
00176 else {
00177 strcat(current_ngram[i]," ");
00178 strcat(current_ngram[i],temp_word);
00179 }
00180 }
00181 }
00182 if (fscanf(temp_file[i],"%d",¤t_ngram_count[i]) != 1) {
00183 if (!rr_feof(temp_file[i])) {
00184 quit(-1,"Error reading temp file %s\n",temp_filename[i]);
00185 }
00186 }
00187 }
00188 }
00189
00190 all_finished = 0;
00191
00192 while (!all_finished) {
00193
00194
00195
00196 strcpy(smallest_ngram,"");
00197
00198 for (i=start_file;i<=end_file;i++) {
00199 if (!finished[i]) {
00200 if (strcmp(smallest_ngram,current_ngram[i]) > 0 ||
00201 (smallest_ngram[0] == '\0')) {
00202 strcpy(smallest_ngram,current_ngram[i]);
00203 }
00204 }
00205 }
00206
00207
00208
00209
00210
00211 temp_count = 0;
00212
00213 for (i=start_file;i<=end_file;i++) {
00214 if (!finished[i]) {
00215 if (!strcmp(smallest_ngram,current_ngram[i])) {
00216 temp_count += current_ngram_count[i];
00217 if (!rr_feof(temp_file[i])) {
00218 for (j=0;j<=n-1;j++) {
00219 if (fscanf(temp_file[i],"%s",temp_word) != 1) {
00220 if (!rr_feof(temp_file[i])) {
00221 quit(-1,"Error reading temp file %s\n",temp_filename[i]);
00222 }
00223 }
00224 else {
00225 if (j==0) {
00226 strcpy(current_ngram[i],temp_word);
00227 }
00228 else {
00229 strcat(current_ngram[i]," ");
00230 strcat(current_ngram[i],temp_word);
00231 }
00232 }
00233 }
00234 if (fscanf(temp_file[i],"%d",¤t_ngram_count[i]) != 1) {
00235 if (!rr_feof(temp_file[i])) {
00236 quit(-1,"Error reading temp file count %s\n",
00237 temp_filename[i]);
00238 }
00239 }
00240 }
00241
00242
00243
00244
00245
00246
00247 if (rr_feof(temp_file[i])) {
00248 finished[i] = 1;
00249 all_finished = 1;
00250 for (j=start_file;j<=end_file;j++) {
00251 if (!finished[j]) {
00252 all_finished = 0;
00253 }
00254 }
00255 }
00256 }
00257 }
00258 }
00259
00260
00261
00262
00263
00264
00265
00266 if (fprintf(outfile,"%s %d\n",smallest_ngram,temp_count) < 0) {
00267 quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
00268 }
00269 }
00270
00271 for (i=start_file;i<=end_file;i++) {
00272 rr_iclose(temp_file[i]);
00273 remove(temp_filename[i]);
00274 }
00275
00276 free(temp_file);
00277 for (i=start_file;i<=end_file;i++) {
00278 free(temp_filename[i]);
00279 }
00280 free(temp_filename);
00281 for (i=start_file;i<=end_file;i++) {
00282 free(current_ngram[i]);
00283 }
00284 free(current_ngram);
00285
00286 free(current_ngram_count);
00287 free(finished);
00288 }
00289
00290
00291
00292 void main (int argc, char **argv) {
00293
00294 int n;
00295 int verbosity;
00296 int max_files;
00297 int max_words;
00298 int max_chars;
00299 char temp_directory[1000];
00300
00301 int current_word;
00302 int current_char;
00303 int start_char;
00304
00305 int no_of_spaces;
00306 int pos_in_string;
00307
00308 int i;
00309 char *current_string;
00310 char current_temp_filename[500];
00311 int current_file_number;
00312 FILE *temp_file;
00313
00314 flag text_buffer_full;
00315
00316 char *text_buffer;
00317 char **pointers;
00318
00319 char current_ngram[500];
00320 int current_count;
00321
00322 int counter;
00323
00324 struct utsname uname_info;
00325 char *temp_file_root;
00326 char *temp_file_ext;
00327 char *host_name;
00328 int proc_id;
00329 char temp_word[500];
00330
00331 flag words_set;
00332 flag chars_set;
00333
00334
00335
00336 verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
00337 pc_message(verbosity,2,"text2wngram\n");
00338
00339 report_version(&argc,argv);
00340
00341 if (pc_flagarg( &argc, argv,"-help")) {
00342 fprintf(stderr,"text2wngram - Convert a text stream to a word n-gram stream.\n");
00343 fprintf(stderr,"Usage : text2wngram [ -n 3 ]\n");
00344 fprintf(stderr," [ -temp %s ]\n",DEFAULT_TEMP);
00345 fprintf(stderr," [ -chars %d ]\n",STD_MEM*7000000/11);
00346 fprintf(stderr," [ -words %d ]\n",STD_MEM*1000000/11);
00347 fprintf(stderr," [ -gzip | -compress ]\n");
00348 fprintf(stderr," [ -verbosity 2 ]\n");
00349 fprintf(stderr," < .text > .wngram\n");
00350 exit(1);
00351 }
00352
00353 n = pc_intarg(&argc, argv,"-n",DEFAULT_N);
00354
00355
00356
00357
00358 max_words = pc_intarg(&argc, argv,"-words",-1);
00359 max_chars = pc_intarg(&argc, argv,"-chars",-1);
00360
00361 if (max_words == -1) {
00362 words_set = 0;
00363 max_words = STD_MEM*1000000/11;
00364 }
00365 else {
00366 words_set = 1;
00367 }
00368
00369 if (max_chars == -1) {
00370 chars_set = 0;
00371 max_chars = STD_MEM*7000000/11;
00372 }
00373 else {
00374 chars_set = 1;
00375 }
00376
00377 max_files = pc_intarg(&argc, argv,"-files",DEFAULT_MAX_FILES);
00378
00379 strcpy(temp_directory,pc_stringarg( &argc, argv, "-temp", DEFAULT_TEMP));
00380
00381
00382 if (pc_flagarg(&argc,argv,"-compress")) {
00383 temp_file_ext = salloc(".Z");
00384 }
00385 else {
00386 if (pc_flagarg(&argc,argv,"-gzip")) {
00387 temp_file_ext = salloc(".gz");
00388 }
00389 else {
00390 temp_file_ext = salloc("");
00391 }
00392 }
00393
00394 uname(&uname_info);
00395
00396 host_name = salloc(uname_info.nodename);
00397
00398 proc_id = getpid();
00399
00400 sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);
00401
00402 temp_file_root = salloc(temp_word);
00403
00404 pc_report_unk_args(&argc,argv,verbosity);
00405
00406 if (words_set && !chars_set) {
00407 max_chars = max_words * 7;
00408 }
00409
00410 if (!words_set && chars_set) {
00411 max_words = max_chars / 7;
00412 }
00413
00414
00415
00416 if (temp_directory[strlen(temp_directory)-1] != '/') {
00417 strcat(temp_directory,"/");
00418 }
00419 pc_message(verbosity,2,"n = %d\n",n);
00420 pc_message(verbosity,2,"Number of words in buffer = %d\n",max_words);
00421 pc_message(verbosity,2,"Number of chars in buffer = %d\n",max_chars);
00422 pc_message(verbosity,2,"Max number of files open at once = %d\n",max_files);
00423 pc_message(verbosity,2,"Temporary directory = %s\n",temp_directory);
00424
00425
00426
00427 text_buffer = (char *) rr_malloc(sizeof(char)*max_chars);
00428 pc_message(verbosity,2,"Allocated %d bytes to text buffer.\n",
00429 sizeof(char)*max_chars);
00430
00431 pointers = (char **) rr_malloc(sizeof(char *)*max_words);
00432 pc_message(verbosity,2,"Allocated %d bytes to pointer array.\n",
00433 sizeof(char *)*max_words);
00434
00435 current_file_number = 0;
00436
00437 current_word = 1;
00438 start_char = 0;
00439 current_char = 0;
00440 counter = 0;
00441 pointers[0] = text_buffer;
00442
00443 while (!feof(stdin)) {
00444
00445 current_file_number++;
00446
00447
00448
00449 pc_message(verbosity,2,"Reading text into buffer...\n");
00450
00451 pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
00452 pc_message(verbosity,2,"20,000 words processed for each \".\", 1,000,000 for each line.\n");
00453
00454 pointers[0] = text_buffer;
00455
00456 while ((!rr_feof(stdin)) &&
00457 (current_word < max_words) &&
00458 (current_char < max_chars)) {
00459
00460 text_buffer[current_char] = getchar();
00461 if (text_buffer[current_char] == '\n' ||
00462 text_buffer[current_char] == '\t' ) {
00463 text_buffer[current_char] = ' ';
00464 }
00465 if (text_buffer[current_char] == ' ') {
00466 if (current_char > start_char) {
00467 if (text_buffer[current_char-1] == ' ') {
00468 current_word--;
00469 current_char--;
00470 }
00471 pointers[current_word] = &(text_buffer[current_char+1]);
00472 current_word++;
00473 counter++;
00474 if (counter % 20000 == 0) {
00475 if (counter % 1000000 == 0) {
00476 pc_message(verbosity,2,"\n");
00477 }
00478 else {
00479 pc_message(verbosity,2,".");
00480 }
00481 }
00482 }
00483 }
00484
00485 if (text_buffer[current_char] != ' ' ||
00486 current_char > start_char) {
00487 current_char++;
00488 }
00489 }
00490
00491 text_buffer[current_char]='\0';
00492
00493
00494 if (current_word == max_words || rr_feof(stdin)) {
00495 for (i=current_char+1;i<=max_chars-1;i++) {
00496 text_buffer[i] = ' ';
00497 }
00498 text_buffer_full = 0;
00499 }
00500 else {
00501 text_buffer_full = 1;
00502 }
00503
00504
00505
00506 pc_message(verbosity,2,"\nSorting pointer array...\n");
00507
00508 qsort((void *) pointers,(size_t) current_word-n,sizeof(char *),cmp_strings);
00509
00510
00511
00512 sprintf(current_temp_filename,"%s%s%hu%s",temp_directory,temp_file_root,current_file_number,temp_file_ext);
00513
00514 pc_message(verbosity,2,"Writing out temporary file %s...\n",current_temp_filename);
00515
00516 temp_file = rr_oopen(current_temp_filename);
00517 text_buffer[current_char] = ' ';
00518
00519 current_count = 0;
00520 strcpy(current_ngram,"");
00521
00522 for (i = 0; i <= current_word-n; i++) {
00523 current_string = pointers[i];
00524
00525
00526
00527 no_of_spaces = 0;
00528 pos_in_string = 0;
00529 while (no_of_spaces < n) {
00530
00531 if (current_string[pos_in_string] == ' ') {
00532 no_of_spaces++;
00533 }
00534 pos_in_string++;
00535 }
00536
00537 if (!strncmp(current_string,current_ngram,pos_in_string)) {
00538 current_count++;
00539 }
00540 else {
00541 if (strcmp(current_ngram,"")) {
00542 if (fprintf(temp_file,"%s %d\n",current_ngram,current_count) < 0) {
00543 quit(-1,"Error writing to temporary file %s\n",current_temp_filename);
00544 }
00545 }
00546 current_count = 1;
00547 strncpy(current_ngram,current_string,pos_in_string);
00548 current_ngram[pos_in_string] = '\0';
00549 }
00550 }
00551
00552 rr_oclose(temp_file);
00553
00554
00555
00556
00557 strcpy(text_buffer,pointers[current_word-n]);
00558 pointers[0]=text_buffer;
00559
00560
00561
00562 no_of_spaces=0;
00563 pos_in_string=0;
00564
00565 if (!text_buffer_full){
00566 while (no_of_spaces<(n-1)) {
00567 if (pointers[0][pos_in_string]==' ') {
00568 no_of_spaces++;
00569 pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
00570 }
00571 pos_in_string++;
00572 }
00573 }
00574 else {
00575 while (no_of_spaces<n) {
00576 if (pointers[0][pos_in_string]==' ') {
00577 no_of_spaces++;
00578 pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
00579 }
00580 pos_in_string++;
00581 }
00582 pos_in_string--;
00583 }
00584
00585 current_char = pos_in_string;
00586 current_word = n;
00587
00588 start_char = current_char;
00589
00590 }
00591
00592
00593 pc_message(verbosity,2,"Merging temporary files...\n");
00594
00595 merge_tempfiles(1,
00596 current_file_number,
00597 temp_file_root,
00598 temp_file_ext,
00599 max_files,
00600 temp_directory,
00601 stdout,
00602 n,
00603 verbosity);
00604 pc_message(verbosity,0,"text2wngram : Done.\n");
00605
00606 exit(0);
00607
00608 }
00609