00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00024
00025
00026
00027
00028
00029 #include <stdio.h>
00030 #include <stdlib.h>
00031 #include <string.h>
00032 #include "toolkit.h"
00033 #include "rr_libs/general.h"
00034 #include "pc_libs/pc_general.h"
00035
00036 #define MAX_STRING_LENGTH 501
00037 #define DEFAULT_HASH 1000000
00038
00042 struct node {
00043
00044 char *word;
00045 int count;
00046
00047 struct node *next;
00048
00049 };
00050
00054 struct hash_table {
00055
00056 int size;
00057 struct node **chain;
00058
00059 };
00060
00061
00062 struct node *new_node( char *key )
00063 {
00064 struct node *x;
00065
00066 x = (struct node *) rr_malloc( sizeof( struct node ) );
00067 x->word = (char *) rr_malloc( (strlen( key ) + 1) * sizeof( char ) );
00068 strcpy( x->word, key );
00069 x->count = 1;
00070 return x;
00071 }
00072
00073
00074 void new_hashtable( struct hash_table *table, int M )
00075 {
00076 int i;
00077
00078 table->size = M;
00079 table->chain = (struct node **) rr_malloc( M * sizeof( struct node *) );
00080 for( i = 0; i < M; i++ ) {
00081 table->chain[i] = new_node( "HEAD_NODE" );
00082 table->chain[i]->next = (struct node *) NULL;
00083 }
00084 }
00085
00086
00087 int update_chain( struct node *t, char *key )
00088 {
00089 struct node *x;
00090 int score;
00091
00092 while( t->next != NULL ) {
00093 score = strcmp( key, t->next->word );
00094
00095 if ( score > 0 ) t = t->next;
00096
00097 else if ( score == 0 ) {
00098 t->next->count++;
00099 return 1;
00100 }
00101
00102 else {
00103 x = new_node( key );
00104 x->next = t->next;
00105 t->next = x;
00106 return 0;
00107 }
00108 }
00109
00110 x = new_node( key );
00111 x->next = (struct node *) NULL;
00112 t->next = x;
00113 return 0;
00114 }
00115
00116
00117 void print_chain( struct node *t )
00118 {
00119 t = t->next;
00120 while ( t != NULL ) {
00121 printf( "%s %d\n", t->word, t->count );
00122 t = t->next;
00123 }
00124 }
00125
00126
00127
00128 int hash( char *key, int M )
00129 {
00130 unsigned int h;
00131 char *t = key;
00132
00133 for( h = 0; *t; t++ )
00134 h = ( 64 * h + *t ) % M;
00135 return h;
00136 }
00137
00138
00139 void print( struct hash_table *table )
00140 {
00141 int i;
00142 for( i = 0; i < table->size; i++ )
00143 print_chain( table->chain[i] );
00144 }
00145
00146
00147 void update( struct hash_table *table, char *key, int verbosity )
00148 {
00149 int chain;
00150
00151 chain = hash( key, table->size );
00152 if ( chain < 0 || chain >= table->size ) {
00153 pc_message(verbosity,1,"WARNING : invalid hash address.\n");
00154 pc_message(verbosity,1,"%s ignored\n", key );
00155 return;
00156 }
00157 update_chain( table->chain[ chain ], key );
00158 }
00159
00160
00161 int nearest_prime(int num)
00162 {
00163 int div;
00164 int num_has_divisor = 1;
00165
00166 if ( num / 2 * 2 == num ) num++;
00167 for (; num_has_divisor; num += 2 ) {
00168 num_has_divisor=0;
00169 for ( div = 3; div <= num / 3; div++ ) {
00170 if ( ( num / div) * div == num ) {
00171 num_has_divisor = 1;
00172 break;
00173 }
00174 }
00175 }
00176 num -= 2;
00177 return( num );
00178 }
00179
00180 int main( int argc, char **argv )
00181 {
00182 int init_nwords, hash_size, scanrc;
00183 struct hash_table vocab;
00184 char word[MAX_STRING_LENGTH];
00185 int verbosity;
00186
00187 if (pc_flagarg( &argc, argv,"-help")) {
00188 fprintf(stderr,"text2wfreq : Generate a word frequency list for text.\n");
00189 fprintf(stderr,"Usage : text2freq [ -hash %d ]\n",DEFAULT_HASH);
00190 fprintf(stderr," [ -verbosity 2 ]\n");
00191 fprintf(stderr," < .text > .wfreq\n");
00192 exit(1);
00193 }
00194
00195
00196
00197 report_version(&argc,argv);
00198 init_nwords = pc_intarg( &argc, argv, "-hash", DEFAULT_HASH );
00199
00200 verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
00201
00202 pc_report_unk_args(&argc,argv,verbosity);
00203
00204 pc_message(verbosity,2,"text2wfreq : Reading text from standard input...\n");
00205
00206 hash_size = nearest_prime( init_nwords );
00207 new_hashtable( &vocab, hash_size );
00208 while( (scanrc = scanf( "%500s", word )) == 1 ) {
00209 if ( strlen( word ) >= 500 ) {
00210 pc_message(verbosity,1,"text2wfreq : WARNING: word too long, will be split: %s...\n",word);
00211 }
00212 if (strlen(word)) {
00213 update( &vocab, word ,verbosity);
00214 }
00215 }
00216 if ( scanrc != EOF ) {
00217 quit(-1,"Error reading input\n");
00218 }
00219 print( &vocab );
00220 pc_message(verbosity,0,"text2wfreq : Done.\n");
00221 return 0;
00222 }
00223
00224