Main Page   Compound List   File List   Compound Members   File Members  

text2wfreq.c

Go to the documentation of this file.
00001 
00002 /*=====================================================================
00003                 =======   COPYRIGHT NOTICE   =======
00004 Copyright (C) 1996, Carnegie Mellon University, Cambridge University,
00005 Ronald Rosenfeld and Philip Clarkson.
00006 
00007 All rights reserved.
00008 
00009 This software is made available for research purposes only.  It may be
00010 redistributed freely for this purpose, in full or in part, provided
00011 that this entire copyright notice is included on any copies of this
00012 software and applications and derivations thereof.
00013 
00014 This software is provided on an "as is" basis, without warranty of any
00015 kind, either expressed or implied, as to any matter including, but not
00016 limited to warranty of fitness of purpose, or merchantability, or
00017 results obtained from use of this software.
00018 ======================================================================*/
00019 
00024 /* Very strongly based on the program wordfreq, by Gary Cook
00025    (gdc@eng.cam.ac.uk), adapted (with permission) for the sake of
00026    consistency with the rest of the toolkit by Philip Clarkson,
00027    27/9/96 */
00028 
00029 #include <stdio.h>
00030 #include <stdlib.h>
00031 #include <string.h>
00032 #include "toolkit.h"
00033 #include "rr_libs/general.h"
00034 #include "pc_libs/pc_general.h"
00035 
00036 #define MAX_STRING_LENGTH 501
00037 #define DEFAULT_HASH 1000000
00038 
00042 struct node {
00043 
00044   char *word;
00045   int count;
00046 
00047   struct node *next;
00048 
00049 };
00050 
00054 struct hash_table {
00055 
00056   int size;
00057   struct node **chain;
00058 
00059 };
00060 
00061 /* create a new node, and sets the count to 1 */
00062 struct node *new_node( char *key )
00063 {
00064   struct node *x;
00065 
00066   x = (struct node *) rr_malloc( sizeof( struct node ) );
00067   x->word = (char *) rr_malloc( (strlen( key ) + 1) * sizeof( char ) );
00068   strcpy( x->word, key );
00069   x->count = 1;
00070   return x;
00071 }
00072 
00073 /* create hash table */
00074 void new_hashtable( struct hash_table *table, int M )
00075 {
00076   int i;
00077   
00078   table->size = M;
00079   table->chain = (struct node **) rr_malloc( M * sizeof( struct node *) );
00080   for( i = 0; i < M; i++ ) {
00081     table->chain[i] = new_node( "HEAD_NODE" );
00082     table->chain[i]->next = (struct node *) NULL;
00083   }
00084 }
00085 
00086 /* update linked list */
00087 int update_chain( struct node *t, char *key )
00088 {
00089   struct node *x;
00090   int score;
00091 
00092   while( t->next != NULL ) {
00093     score = strcmp( key, t->next->word ); 
00094     /* move to next node */
00095     if ( score > 0 ) t = t->next;
00096     /* update node */
00097     else if ( score == 0 ) {
00098       t->next->count++;
00099       return 1;
00100     }
00101     /* add new node */
00102     else {
00103       x = new_node( key );
00104       x->next = t->next;
00105       t->next = x;
00106       return 0;
00107     }
00108   }
00109   /* add node at end */
00110   x = new_node( key );
00111   x->next = (struct node *) NULL;
00112   t->next = x;
00113   return 0;
00114 }
00115 
00116 /* print contents of linked list */
00117 void print_chain( struct node *t )
00118 {
00119   t = t->next;  /* don't print head node */
00120   while ( t != NULL ) {
00121     printf( "%s %d\n", t->word, t->count );
00122     t = t->next;
00123   }
00124 }
00125 
00126 /* generate a hash table address from a variable length character */
00127 /* string - from R. Sedgewick, "Algorithms in C++". */
00128 int hash( char *key, int M )
00129 {
00130   unsigned int h; 
00131   char *t = key;
00132 
00133   for( h = 0; *t; t++ )
00134     h = ( 64 * h + *t ) % M;
00135   return h;
00136 }
00137 
00138 /* print hash table contents */
00139 void print( struct hash_table *table )
00140 {
00141   int i;
00142   for( i = 0; i < table->size; i++ )
00143     print_chain( table->chain[i] );
00144 }
00145 
00146 /* update hash table contents */
00147 void update( struct hash_table *table, char *key, int verbosity )
00148 {
00149   int chain;
00150   
00151   chain = hash( key, table->size );
00152   if ( chain < 0 || chain >= table->size ) {
00153     pc_message(verbosity,1,"WARNING : invalid hash address.\n");
00154     pc_message(verbosity,1,"%s ignored\n", key );
00155     return;
00156   }
00157   update_chain( table->chain[ chain ], key );
00158 }
00159 
00160 /* return the nearest prime not smaller than 'num' */
00161 int nearest_prime(int num)
00162 {
00163   int div;
00164   int num_has_divisor = 1;
00165   
00166   if ( num / 2 * 2 == num ) num++; 
00167   for (; num_has_divisor; num += 2 ) {
00168      num_has_divisor=0;
00169      for ( div = 3; div <= num / 3; div++ ) {
00170         if ( ( num / div) * div == num ) {
00171            num_has_divisor = 1;
00172            break;
00173         }
00174      }
00175   }
00176   num -= 2;
00177   return( num );
00178 }
00179 
00180 int main( int argc, char **argv )
00181 {
00182   int init_nwords, hash_size, scanrc;
00183   struct hash_table vocab;
00184   char word[MAX_STRING_LENGTH];
00185   int verbosity;
00186 
00187   if (pc_flagarg( &argc, argv,"-help")) {
00188     fprintf(stderr,"text2wfreq : Generate a word frequency list for text.\n");
00189     fprintf(stderr,"Usage : text2freq [ -hash %d ]\n",DEFAULT_HASH);
00190     fprintf(stderr,"                  [ -verbosity 2 ]\n");
00191     fprintf(stderr,"                  < .text > .wfreq\n");
00192     exit(1);
00193   }
00194 
00195   /* process command line */
00196 
00197   report_version(&argc,argv);
00198   init_nwords = pc_intarg( &argc, argv, "-hash", DEFAULT_HASH );
00199 
00200   verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
00201 
00202   pc_report_unk_args(&argc,argv,verbosity);
00203 
00204   pc_message(verbosity,2,"text2wfreq : Reading text from standard input...\n");
00205 
00206   hash_size = nearest_prime( init_nwords );
00207   new_hashtable( &vocab, hash_size );
00208   while( (scanrc = scanf( "%500s", word )) == 1 ) {
00209     if ( strlen( word ) >= 500 ) {
00210       pc_message(verbosity,1,"text2wfreq : WARNING: word too long, will be split: %s...\n",word);
00211     }
00212     if (strlen(word)) {
00213       update( &vocab, word ,verbosity);
00214     }
00215   }
00216   if ( scanrc != EOF ) {
00217     quit(-1,"Error reading input\n");
00218   }
00219   print( &vocab );
00220   pc_message(verbosity,0,"text2wfreq : Done.\n");
00221   return 0;
00222 }
00223 
00224 

Generated on Tue Dec 21 13:54:46 2004 by doxygen1.2.18