Main Page   Compound List   File List   Compound Members   File Members  

text2idngram.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00020 #define DEFAULT_HASH_SIZE 200000
00021 #define DEFAULT_MAX_FILES 20
00022 #define MAX_N 20
00023 #define TEMP_FILE_ROOT "text2idngram.temp."
00024 
00025 #include <stdio.h>
00026 #include <stdlib.h>
00027 #include <string.h>
00028 #include <sys/types.h>
00029 #include <sys/utsname.h>
00030 #include <unistd.h>
00031 #include "toolkit.h"
00032 #include "rr_libs/general.h"
00033 #include "pc_libs/pc_general.h"
00034 #include "idngram.h"
00035 
00041 void add_to_buffer(unsigned short word_index,
00042                    int ypos,
00043                    int xpos, 
00044                    unsigned short *buffer) {
00045 
00046   
00047   buffer[(n*ypos)+xpos] = word_index;
00048 
00049 }
00050 
00051 unsigned short buffer_contents(int ypos,
00052                                int xpos, 
00053                                unsigned short *buffer) {
00054   
00055   return (buffer[(n*ypos)+xpos]);
00056 
00057 }
00058 
00059 
00060 
00061 /***************************
00062       MAIN FUNCTION
00063  ***************************/
00064 
00065 void main(int argc, char *argv[]) {
00066 
00067   int i,j;
00068 
00069   char *vocab_filename;
00070   FILE *tempfile;
00071   char tempfiles_directory[1000];
00072   int vocab_size;
00073   FILE *vocab_file;
00074 
00075   int verbosity;
00076 
00077   int buffer_size;
00078   int position_in_buffer;
00079   int number_of_tempfiles;
00080   int max_files;
00081   int fof_size;
00082 
00083   unsigned short *buffer;
00084   unsigned short *placeholder;
00085   unsigned short *temp_ngram;
00086   int temp_count;
00087   
00088   char temp_word[MAX_WORD_LENGTH];
00089   char temp_word2[MAX_WORD_LENGTH];
00090 
00091   char *temp_file_root;
00092   char *temp_file_ext;
00093   char *host_name;
00094   int proc_id;
00095   struct utsname uname_info;
00096 
00097   flag write_ascii;
00098 
00099   /* Vocab hash table things */
00100 
00101   struct hash_table vocabulary;
00102   unsigned long hash_size;
00103   unsigned long M;
00104 
00105   tempfile = NULL; /* Just to prevent compilation warnings. */
00106 
00107   report_version(&argc,argv);
00108 
00109   verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
00110 
00111   /* Process command line */
00112   
00113   if (pc_flagarg( &argc, argv,"-help") || argc==1) {
00114     fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n");
00115     fprintf(stderr,"Usage : text2idngram  -vocab .vocab \n");
00116     fprintf(stderr,"                    [ -buffer 100 ]\n");
00117     fprintf(stderr,"                    [ -hash %d ]\n",DEFAULT_HASH_SIZE);
00118     fprintf(stderr,"                    [ -temp %s ]\n",DEFAULT_TEMP);
00119     fprintf(stderr,"                    [ -files %d ]\n",DEFAULT_MAX_FILES);
00120     fprintf(stderr,"                    [ -gzip | -compress ]\n");
00121     fprintf(stderr,"                    [ -verbosity %d ]\n",
00122             DEFAULT_VERBOSITY);
00123     fprintf(stderr,"                    [ -n 3 ]\n");
00124     fprintf(stderr,"                    [ -write_ascii ]\n");
00125     fprintf(stderr,"                    [ -fof_size 10 ]\n");
00126     exit(1);
00127   }
00128 
00129   pc_message(verbosity,2,"text2idngram\n");
00130 
00131   n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
00132 
00133   placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
00134   temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
00135   hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
00136   buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
00137 
00138   write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
00139 
00140   fof_size = pc_intarg(&argc,argv,"-fof_size",10);
00141 
00142   max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
00143 
00144   vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
00145   
00146   if (!strcmp("",vocab_filename)) {
00147     quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n");
00148   }
00149     
00150   strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", 
00151                                            DEFAULT_TEMP));
00152 
00153   if (pc_flagarg(&argc,argv,"-compress")) {
00154     temp_file_ext = salloc(".Z");
00155   }
00156   else {
00157     if (pc_flagarg(&argc,argv,"-gzip")) {
00158       temp_file_ext = salloc(".gz");
00159     }
00160     else {
00161       temp_file_ext = salloc("");
00162     }
00163   }
00164 
00165   uname(&uname_info);
00166 
00167   host_name = salloc(uname_info.nodename);
00168 
00169   proc_id = getpid();
00170 
00171   sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);
00172 
00173   temp_file_root = salloc(temp_word);
00174 
00175   pc_report_unk_args(&argc,argv,verbosity);
00176   
00177   /* If the last charactor in the directory name isn't a / then add one. */
00178   
00179   if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
00180     strcat(tempfiles_directory,"/");
00181   }
00182   
00183   pc_message(verbosity,2,"Vocab                  : %s\n",vocab_filename);
00184   pc_message(verbosity,2,"N-gram buffer size     : %d\n",buffer_size);
00185   pc_message(verbosity,2,"Hash table size        : %d\n",hash_size);
00186   pc_message(verbosity,2,"Temp directory         : %s\n",tempfiles_directory);
00187   pc_message(verbosity,2,"Max open files         : %d\n",max_files);
00188   pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  
00189   pc_message(verbosity,2,"n                      : %d\n",n);
00190 
00191   buffer_size *= (1000000/(sizeof(unsigned short)*n));
00192 
00193   /* Allocate memory for hash table */
00194 
00195   fprintf(stderr,"Initialising hash table...\n");
00196 
00197   M = nearest_prime(hash_size);
00198 
00199   new_hashtable(&vocabulary,M);
00200 
00201   /* Read in the vocabulary */
00202 
00203   vocab_size = 0;
00204 
00205   vocab_file = rr_iopen(vocab_filename);
00206 
00207   pc_message(verbosity,2,"Reading vocabulary...\n");
00208 
00209   while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
00210     if (strncmp(temp_word,"##",2)==0) continue;
00211     sscanf (temp_word, "%s ",temp_word2);
00212 
00213     /* Check for repeated words in the vocabulary */
00214 
00215     if (index2(&vocabulary,temp_word2) != 0) {
00216       fprintf(stderr,"======================================================\n");
00217       fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word);
00218       fprintf(stderr,"=======================================================\n");
00219     }
00220     if (strncmp(temp_word,"#",1)==0) {
00221       fprintf(stderr,"\n\n===========================================================\n");
00222       fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
00223       fprintf(stderr,     ">>> %s <<<\n",temp_word);
00224       fprintf(stderr,     "         '%s' will be included in the vocabulary.\n",temp_word2);
00225       fprintf(stderr,     "         (comments must start with '##')\n");
00226       fprintf(stderr,"===========================================================\n\n");
00227     }
00228     vocab_size++;
00229     add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
00230   }
00231 
00232   if (vocab_size > MAX_VOCAB_SIZE) {
00233     quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n");
00234   }   
00235   
00236   pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n");
00237 
00238   buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short));
00239 
00240   number_of_tempfiles = 0;
00241 
00242   /* Read text into buffer */
00243 
00244   /* Read in the first ngram */
00245 
00246   position_in_buffer = 0;
00247 
00248   for (i=0;i<=n-1;i++) {
00249     get_word(stdin,temp_word);
00250     add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer);
00251   }
00252 
00253   while (!rr_feof(stdin)) {
00254 
00255     /* Fill up the buffer */
00256 
00257     pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
00258     pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
00259     while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) {
00260       position_in_buffer++;
00261       if (position_in_buffer % 20000 == 0) {
00262         if (position_in_buffer % 1000000 == 0) {
00263           pc_message(verbosity,2,".\n");
00264         }
00265         else {
00266           pc_message(verbosity,2,".");
00267         }
00268       }
00269       for (i=1;i<=n-1;i++) {
00270         add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
00271                       position_in_buffer,i-1,buffer);
00272       }
00273       if (get_word(stdin,temp_word) == 1) {
00274         add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer,
00275                       n-1,buffer);
00276       }
00277     }
00278 
00279     for (i=0;i<=n-1;i++) {
00280       placeholder[i] = buffer_contents(position_in_buffer,i,buffer);
00281     }
00282 
00283     /* Sort buffer */
00284     
00285     pc_message(verbosity,2,"\nSorting n-grams...\n");
00286     
00287     qsort((void*) buffer,(size_t) position_in_buffer,
00288           n*sizeof(unsigned short),compare_ngrams);
00289 
00290     /* Output the buffer to temporary BINARY file */
00291     
00292     number_of_tempfiles++;
00293 
00294     sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
00295             number_of_tempfiles,temp_file_ext);
00296 
00297     pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
00298                temp_word);
00299 
00300     tempfile = rr_oopen(temp_word);
00301 
00302     for (i=0;i<=n-1;i++) {
00303       temp_ngram[i] = buffer_contents(0,i,buffer);
00304       if (temp_ngram[i] > MAX_VOCAB_SIZE) {
00305         quit(-1,"Invalid trigram in buffer.\nAborting");
00306 
00307       }
00308     }
00309     temp_count = 1;
00310 
00311     for (i=1;i<=position_in_buffer;i++) {
00312  
00313       if (!compare_ngrams(temp_ngram,&buffer[i*n])) {
00314         temp_count++;
00315       }
00316       else {
00317         for (j=0;j<=n-1;j++) {
00318           rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1,
00319                     tempfile,"temporary n-gram ids");
00320           temp_ngram[j] = buffer_contents(i,j,buffer);
00321         }
00322         rr_fwrite(&temp_count,sizeof(int),1,tempfile,
00323                   "temporary n-gram counts");
00324         temp_count = 1;
00325       }
00326     }
00327     
00328     rr_oclose(tempfile);
00329 
00330     for (i=0;i<=n-1;i++) {
00331       add_to_buffer(placeholder[i],0,i,buffer);
00332     }
00333 
00334     position_in_buffer = 0;
00335 
00336   }
00337 
00338   /* Merge the temporary files, and output the result to standard output */
00339 
00340   pc_message(verbosity,2,"Merging temporary files...\n");
00341   
00342   merge_tempfiles(1,
00343                   number_of_tempfiles,
00344                   temp_file_root,
00345                   temp_file_ext,
00346                   max_files,
00347                   tempfiles_directory,
00348                   stdout,
00349                   write_ascii,
00350                   fof_size); 
00351 
00352   pc_message(verbosity,0,"text2idngram : Done.\n");
00353 
00354   exit(0);
00355   
00356 }
00357 

Generated on Tue Dec 21 13:54:46 2004 by doxygen 1.2.18