Main Page   Compound List   File List   Compound Members   File Members  

text2wngram.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00020 #define DEFAULT_MAX_FILES 20
00021 #define TEMP_FILE_ROOT "text2wngram.tmp."
00022 
00023 #include <sys/types.h>
00024 #include <unistd.h>
00025 #include <sys/utsname.h>
00026 #include <stdio.h>
00027 #include <string.h>
00028 #include <stdlib.h>
00029 #include "toolkit.h"
00030 #include "pc_libs/pc_general.h"
00031 #include "rr_libs/general.h"
00032 
/*
 * qsort() comparison callback for an array of C-string pointers.
 *
 * string1 and string2 each point at an array element, i.e. at a
 * pointer to a NUL-terminated string.  The result is that of strcmp()
 * applied to the two strings: negative, zero or positive.
 */
int cmp_strings(const void *string1,const void *string2) {

  const char *const *lhs = (const char *const *) string1;
  const char *const *rhs = (const char *const *) string2;

  return strcmp(*lhs,*rhs);

}
00047 
00048 void merge_tempfiles (int start_file, 
00049                       int end_file, 
00050                       char *temp_file_root,
00051                       char *temp_file_ext,
00052                       int max_files,
00053                       char *tempfiles_directory, 
00054                       FILE *outfile,
00055                       int n,
00056                       int verbosity) {
00057 
00058 
00059   FILE *new_temp_file;
00060   char *new_temp_filename;
00061   
00062   FILE **temp_file;
00063   char **temp_filename;
00064   char **current_ngram;
00065   char smallest_ngram[1000];
00066   int *current_ngram_count;
00067   flag *finished;
00068   flag all_finished;
00069   int temp_count;
00070   char temp_word[500];
00071   int i,j;
00072   
00073   pc_message(verbosity,2,"Merging temp files %d through %d...\n", start_file,
00074           end_file);
00075    /*
00076     * If we try to do more than max_files, then merge into groups,
00077     * then merge groups recursively.
00078     */
00079     if (end_file-start_file+1 > max_files) {
00080        int new_start_file, new_end_file;
00081        int n_file_groups = 1 + (end_file-start_file)/max_files;
00082  
00083        fprintf(stderr, "%d files to do, in %d groups\n", end_file-start_file,
00084               n_file_groups);
00085  
00086        new_temp_filename = (char *) rr_malloc(300*sizeof(char));
00087  
00088        /*
00089         * These n_file_groups sets of files will be done in groups of
00090         * max_files batches each, as temp files numbered
00091         * end_file+1 ... end_file+n_file_groups,
00092         * and then these will be merged into the final result.
00093         */
00094  
00095        for (i = 0; i < n_file_groups; i++) {
00096           /* do files i*max_files through min((i+1)*max_files-1,end_file); */
00097           new_start_file = start_file + (i*max_files);
00098           new_end_file = start_file + ((i+1)*max_files) - 1;
00099           if (new_end_file > end_file) new_end_file = end_file;
00100           
00101           sprintf(new_temp_filename,
00102                   "%s%s%hu%s",
00103                   tempfiles_directory,
00104                   temp_file_root,
00105                   end_file+i+1,
00106                   temp_file_ext);
00107  
00108           new_temp_file = rr_oopen(new_temp_filename);
00109  
00110           merge_tempfiles(new_start_file,
00111                           new_end_file,
00112                           temp_file_root,
00113                           temp_file_ext,
00114                           max_files,
00115                           tempfiles_directory,
00116                           new_temp_file,
00117                           n,
00118                           verbosity);
00119  
00120           rr_iclose(new_temp_file);
00121  
00122        }
00123  
00124        merge_tempfiles(end_file+1,
00125                        end_file+n_file_groups,
00126                        temp_file_root,
00127                        temp_file_ext,
00128                        max_files,
00129                        tempfiles_directory,
00130                        outfile,
00131                        n,
00132                        verbosity);
00133  
00134        return;
00135     }
00136     
00137    /*
00138     * We know we are now doing <= max_files.
00139     */
00140  
00141    temp_file = (FILE **) rr_malloc((end_file+1)*sizeof(FILE *));
00142    temp_filename = (char **) rr_malloc((end_file+1)*sizeof(char *));
00143    for (i=start_file;i<=end_file;i++) {
00144      temp_filename[i] = (char *) rr_malloc(300*sizeof(char));
00145    }
00146    current_ngram = (char **) rr_malloc((end_file+1)*sizeof(char *));
00147    for (i=start_file;i<=end_file;i++) {
00148      current_ngram[i] = (char *) rr_malloc(1000*sizeof(char));
00149    }
00150    current_ngram_count = (int *) rr_malloc((end_file+1)*sizeof(int));
00151    finished = (flag *) rr_malloc(sizeof(flag)*(end_file+1));
00152   
00153    /* Open all the temp files for reading */
00154    for (i=start_file;i<=end_file;i++) {
00155      sprintf(temp_filename[i],"%s%s%hu%s",tempfiles_directory,
00156              temp_file_root,i,temp_file_ext);
00157      temp_file[i] = rr_iopen(temp_filename[i]);
00158    }
00159  
00160    /* Now go through the files simultaneously, and write out the appropriate
00161       ngram counts to the output file. */
00162  
00163    for (i=start_file;i<=end_file;i++) {
00164      finished[i] = 0;
00165      if (!rr_feof(temp_file[i])) {
00166        for (j=0;j<=n-1;j++) {
00167         if (fscanf(temp_file[i],"%s",temp_word) != 1) {
00168           if (!rr_feof(temp_file[i])) {
00169             quit(-1,"Error reading temp file %s\n",temp_filename[i]);
00170           }
00171         }
00172         else {
00173           if (j==0) {
00174             strcpy(current_ngram[i],temp_word);
00175           }
00176           else {
00177             strcat(current_ngram[i]," ");
00178             strcat(current_ngram[i],temp_word);
00179           }
00180         }
00181        }
00182        if (fscanf(temp_file[i],"%d",&current_ngram_count[i]) != 1) {
00183          if (!rr_feof(temp_file[i])) {
00184            quit(-1,"Error reading temp file %s\n",temp_filename[i]);
00185          }
00186        }
00187      }
00188    }
00189    
00190    all_finished = 0;
00191    
00192    while (!all_finished) {
00193   
00194      /* Find the smallest current ngram */
00195  
00196      strcpy(smallest_ngram,"");
00197  
00198      for (i=start_file;i<=end_file;i++) {
00199        if (!finished[i]) {
00200          if (strcmp(smallest_ngram,current_ngram[i]) > 0 ||
00201              (smallest_ngram[0] == '\0')) {
00202            strcpy(smallest_ngram,current_ngram[i]);
00203          }
00204        }
00205      }
00206      
00207      /* For each of the files that are currently holding this ngram,
00208         add its count to the temporary count, and read in a new ngram
00209         from the files. */
00210   
00211      temp_count = 0;
00212  
00213      for (i=start_file;i<=end_file;i++) {
00214        if (!finished[i]) {
00215         if (!strcmp(smallest_ngram,current_ngram[i])) {
00216           temp_count += current_ngram_count[i];
00217           if (!rr_feof(temp_file[i])) {
00218             for (j=0;j<=n-1;j++) {
00219               if (fscanf(temp_file[i],"%s",temp_word) != 1) {
00220                 if (!rr_feof(temp_file[i])) {
00221                   quit(-1,"Error reading temp file %s\n",temp_filename[i]);
00222                 }
00223               }
00224               else {
00225                 if (j==0) {
00226                   strcpy(current_ngram[i],temp_word);
00227                 }
00228                 else {
00229                   strcat(current_ngram[i]," ");
00230                   strcat(current_ngram[i],temp_word);
00231                 }
00232               }
00233             }
00234             if (fscanf(temp_file[i],"%d",&current_ngram_count[i]) != 1) {
00235               if (!rr_feof(temp_file[i])) {
00236                 quit(-1,"Error reading temp file count %s\n",
00237                      temp_filename[i]);
00238               }
00239             }
00240           }
00241  
00242           /*
00243            * PWP: Note that the fscanf may have changed the state of
00244            * temp_file[i], so we re-ask the question rather than just
00245            * doing an "else".
00246            */
00247           if (rr_feof(temp_file[i])) {
00248             finished[i] = 1;
00249             all_finished = 1;
00250             for (j=start_file;j<=end_file;j++) {
00251               if (!finished[j]) {
00252                 all_finished = 0;
00253               }
00254             }
00255           }
00256         }
00257         }
00258       }
00259  
00260      /*
00261       * PWP: We cannot conditionalize this on (!all_finished) because
00262       * if we do we may have lost the very last count.  (Consider the
00263       * case when several files have ran out of data, but the last
00264       * couple have the last count in them.)
00265       */
00266      if (fprintf(outfile,"%s %d\n",smallest_ngram,temp_count) < 0) {
00267        quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
00268      }
00269    }
00270  
00271    for (i=start_file;i<=end_file;i++) {
00272      rr_iclose(temp_file[i]);
00273      remove(temp_filename[i]);
00274    }
00275     
00276     free(temp_file);
00277    for (i=start_file;i<=end_file;i++) {
00278       free(temp_filename[i]);
00279     }
00280     free(temp_filename);  
00281    for (i=start_file;i<=end_file;i++) {
00282       free(current_ngram[i]);
00283     }
00284     free(current_ngram);
00285 
00286   free(current_ngram_count);
00287   free(finished);
00288 }
00289 
00290 
00291 
00292 void main (int argc, char **argv) {
00293 
00294   int n;
00295   int verbosity;
00296   int max_files;
00297   int max_words;
00298   int max_chars;
00299   char temp_directory[1000];
00300 
00301   int current_word;
00302   int current_char;
00303   int start_char;               /* start boundary (possibly > than 0) */
00304 
00305   int no_of_spaces;
00306   int pos_in_string;
00307 
00308   int i;
00309   char *current_string;
00310   char current_temp_filename[500];
00311   int current_file_number;
00312   FILE *temp_file;
00313 
00314   flag text_buffer_full;
00315 
00316   char *text_buffer;
00317   char **pointers;
00318 
00319   char current_ngram[500];
00320   int current_count;
00321 
00322   int counter;
00323 
00324   struct utsname uname_info;
00325   char *temp_file_root;
00326   char *temp_file_ext;
00327   char *host_name;
00328   int proc_id;
00329   char temp_word[500];
00330 
00331   flag words_set;
00332   flag chars_set;
00333 
00334   /* Process command line */
00335 
00336   verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
00337   pc_message(verbosity,2,"text2wngram\n");
00338 
00339   report_version(&argc,argv);
00340 
00341   if (pc_flagarg( &argc, argv,"-help")) {
00342     fprintf(stderr,"text2wngram - Convert a text stream to a word n-gram stream.\n");
00343     fprintf(stderr,"Usage : text2wngram [ -n 3 ]\n");
00344     fprintf(stderr,"                    [ -temp %s ]\n",DEFAULT_TEMP);
00345     fprintf(stderr,"                    [ -chars %d ]\n",STD_MEM*7000000/11);
00346     fprintf(stderr,"                    [ -words %d ]\n",STD_MEM*1000000/11);
00347     fprintf(stderr,"                    [ -gzip | -compress ]\n");
00348     fprintf(stderr,"                    [ -verbosity 2 ]\n");
00349     fprintf(stderr,"                    < .text > .wngram\n");
00350     exit(1);
00351   }
00352 
00353   n = pc_intarg(&argc, argv,"-n",DEFAULT_N);
00354 
00355   /*  max_words = pc_intarg(&argc, argv,"-words",STD_MEM*1000000/11);
00356   max_chars = pc_intarg(&argc, argv,"-chars",STD_MEM*7000000/11); */
00357 
00358   max_words = pc_intarg(&argc, argv,"-words",-1);
00359   max_chars = pc_intarg(&argc, argv,"-chars",-1);
00360 
00361   if (max_words == -1) {
00362     words_set = 0;
00363     max_words = STD_MEM*1000000/11;
00364   }
00365   else {
00366     words_set = 1;
00367   }
00368 
00369   if (max_chars == -1) {
00370     chars_set = 0;
00371     max_chars = STD_MEM*7000000/11; 
00372   }
00373   else {
00374     chars_set = 1;
00375   }
00376   
00377   max_files = pc_intarg(&argc, argv,"-files",DEFAULT_MAX_FILES);
00378 
00379   strcpy(temp_directory,pc_stringarg( &argc, argv, "-temp", DEFAULT_TEMP));
00380 
00381 
00382   if (pc_flagarg(&argc,argv,"-compress")) {
00383     temp_file_ext = salloc(".Z");
00384   }
00385   else {
00386     if (pc_flagarg(&argc,argv,"-gzip")) {
00387       temp_file_ext = salloc(".gz");
00388     }
00389     else {
00390       temp_file_ext = salloc("");
00391     }
00392   }
00393 
00394   uname(&uname_info);
00395 
00396   host_name = salloc(uname_info.nodename);
00397 
00398   proc_id = getpid();
00399 
00400   sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);
00401 
00402   temp_file_root = salloc(temp_word);
00403 
00404   pc_report_unk_args(&argc,argv,verbosity);
00405  
00406   if (words_set && !chars_set) {
00407     max_chars = max_words * 7;
00408   }
00409 
00410   if (!words_set && chars_set) {
00411     max_words = max_chars / 7;
00412   }
00413 
00414   /* If the last charactor in the directory name isn't a / then add one. */
00415   
00416   if (temp_directory[strlen(temp_directory)-1] != '/') {
00417     strcat(temp_directory,"/");
00418   }
00419   pc_message(verbosity,2,"n = %d\n",n);
00420   pc_message(verbosity,2,"Number of words in buffer = %d\n",max_words);
00421   pc_message(verbosity,2,"Number of chars in buffer = %d\n",max_chars);
00422   pc_message(verbosity,2,"Max number of files open at once = %d\n",max_files);
00423   pc_message(verbosity,2,"Temporary directory = %s\n",temp_directory);
00424 
00425   /* Allocate memory for the buffers */
00426 
00427   text_buffer = (char *) rr_malloc(sizeof(char)*max_chars);
00428   pc_message(verbosity,2,"Allocated %d bytes to text buffer.\n",
00429              sizeof(char)*max_chars);
00430 
00431   pointers = (char **) rr_malloc(sizeof(char *)*max_words);
00432   pc_message(verbosity,2,"Allocated %d bytes to pointer array.\n",
00433              sizeof(char *)*max_words);
00434 
00435   current_file_number = 0;
00436 
00437   current_word = 1;
00438   start_char = 0;
00439   current_char = 0;
00440   counter = 0;
00441   pointers[0] = text_buffer;
00442       
00443   while (!feof(stdin)) {
00444 
00445     current_file_number++;
00446 
00447     /* Read text into buffer */
00448     
00449     pc_message(verbosity,2,"Reading text into buffer...\n");
00450 
00451     pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
00452     pc_message(verbosity,2,"20,000 words processed for each \".\", 1,000,000 for each line.\n");
00453     
00454     pointers[0] = text_buffer;
00455     
00456     while ((!rr_feof(stdin)) && 
00457            (current_word < max_words) && 
00458            (current_char < max_chars)) {
00459 
00460       text_buffer[current_char] = getchar();
00461       if (text_buffer[current_char] == '\n' || 
00462           text_buffer[current_char] == '\t' ) {
00463         text_buffer[current_char] = ' ';
00464       }
00465       if (text_buffer[current_char] == ' ') {
00466         if (current_char > start_char) {
00467           if (text_buffer[current_char-1] == ' ') {
00468             current_word--;
00469             current_char--;
00470           }
00471           pointers[current_word] = &(text_buffer[current_char+1]);
00472           current_word++; 
00473           counter++;
00474           if (counter % 20000 == 0) {
00475             if (counter % 1000000 == 0) {
00476               pc_message(verbosity,2,"\n");
00477             }
00478             else {
00479               pc_message(verbosity,2,".");
00480             }
00481           }
00482         }
00483       }
00484       
00485       if (text_buffer[current_char] != ' ' ||
00486           current_char > start_char) {
00487         current_char++;
00488       }
00489     }
00490 
00491     text_buffer[current_char]='\0';
00492 
00493 
00494     if (current_word == max_words || rr_feof(stdin)) {
00495       for (i=current_char+1;i<=max_chars-1;i++) {
00496         text_buffer[i] = ' ';
00497       }
00498       text_buffer_full = 0;
00499     }
00500     else {
00501       text_buffer_full = 1;
00502     }
00503     
00504     /* Sort buffer */
00505 
00506     pc_message(verbosity,2,"\nSorting pointer array...\n"); 
00507 
00508     qsort((void *) pointers,(size_t) current_word-n,sizeof(char *),cmp_strings);
00509    
00510     /* Write out temporary file */
00511 
00512     sprintf(current_temp_filename,"%s%s%hu%s",temp_directory,temp_file_root,current_file_number,temp_file_ext);
00513 
00514     pc_message(verbosity,2,"Writing out temporary file %s...\n",current_temp_filename);
00515         
00516     temp_file = rr_oopen(current_temp_filename);
00517     text_buffer[current_char] = ' ';
00518     
00519     current_count = 0;
00520     strcpy(current_ngram,"");
00521     
00522     for (i = 0; i <= current_word-n; i++) {
00523       current_string = pointers[i];
00524       
00525       /* Find the nth space */
00526 
00527       no_of_spaces = 0;
00528       pos_in_string = 0;
00529       while (no_of_spaces < n) {
00530         
00531         if (current_string[pos_in_string] == ' ') {
00532           no_of_spaces++;
00533         }
00534         pos_in_string++;
00535       }
00536       
00537       if (!strncmp(current_string,current_ngram,pos_in_string)) {
00538         current_count++;
00539       }
00540       else {
00541         if (strcmp(current_ngram,"")) {
00542           if (fprintf(temp_file,"%s %d\n",current_ngram,current_count) < 0)  {
00543             quit(-1,"Error writing to temporary file %s\n",current_temp_filename);
00544           }
00545         }
00546         current_count = 1;
00547         strncpy(current_ngram,current_string,pos_in_string);
00548         current_ngram[pos_in_string] = '\0';
00549       }
00550     }
00551     
00552     rr_oclose(temp_file);
00553 
00554     /* Move the last n-1 words to the beginning of the buffer, and set
00555        correct current_word and current_char things */
00556 
00557     strcpy(text_buffer,pointers[current_word-n]);
00558     pointers[0]=text_buffer;
00559    
00560     /* Find the (n-1)th space */
00561 
00562     no_of_spaces=0;
00563     pos_in_string=0;
00564 
00565     if (!text_buffer_full){ 
00566       while (no_of_spaces<(n-1)) {
00567         if (pointers[0][pos_in_string]==' ') {
00568           no_of_spaces++;
00569           pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
00570         }
00571         pos_in_string++;
00572       }
00573     }
00574     else {
00575       while (no_of_spaces<n) {
00576         if (pointers[0][pos_in_string]==' ') {
00577           no_of_spaces++;
00578           pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
00579         }
00580         pos_in_string++;
00581       }
00582       pos_in_string--;
00583     }
00584 
00585     current_char = pos_in_string;
00586     current_word = n;
00587     /* mark boundary beyond which counting pass cannot backup */
00588     start_char = current_char;
00589 
00590   }
00591   /* Merge temporary files */
00592 
00593   pc_message(verbosity,2,"Merging temporary files...\n");
00594 
00595   merge_tempfiles(1,
00596                   current_file_number,
00597                   temp_file_root,
00598                   temp_file_ext,
00599                   max_files,
00600                   temp_directory,
00601                   stdout,
00602                   n,
00603                   verbosity); 
00604   pc_message(verbosity,0,"text2wngram : Done.\n");
00605 
00606   exit(0);
00607 
00608 }
00609 

Generated on Tue Dec 21 13:54:46 2004 by doxygen1.2.18