00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00026
00027
00028
00029 #include <stdio.h>
00030 #include <strings.h>
00031 #include "general.h"
00032 #include "sih.h"
00033
00034 void read_voc(char *filename, int verbosity,
00035 sih_t *p_vocab_ht, char ***p_vocab,
00036 unsigned short *p_vocab_size)
00037
00038 {
00039
00040
00041 char *pperiod;
00042 int vocab_size;
00043
00044 pperiod = rindex(filename,'.');
00045 if (pperiod==NULL) pperiod = filename-1;
00046
00047 if (strcmp(pperiod+1,"vocab_ht")==0) {
00048 FILE *fp=rr_iopen(filename);
00049 sih_val_read_from_file(p_vocab_ht, fp, filename, verbosity);
00050 rr_iclose(fp);
00051 vocab_size = p_vocab_ht->nentries;
00052 if (p_vocab!=NULL) {
00053 get_vocab_from_vocab_ht(p_vocab_ht, vocab_size, verbosity, p_vocab);
00054 *p_vocab[0] = salloc("<UNK>");
00055 }
00056 }
00057 else {
00058 read_wlist_into_siht(filename, verbosity, p_vocab_ht, &vocab_size);
00059 if (p_vocab!=NULL) {
00060 read_wlist_into_array(filename, verbosity, p_vocab, &vocab_size);
00061 *p_vocab[0] = salloc("<UNK>");
00062 }
00063 }
00064
00065 if (p_vocab_size) {
00066 *p_vocab_size = vocab_size;
00067 }
00068
00069 }
00070
00071
00074 void get_vocab_from_vocab_ht(sih_t *ht, int vocab_size, int verbosity, char ***p_vocab)
00075 {
00076 static char rname[]="get_vocab_fm_ht";
00077 char **wlist;
00078 int islot, wordid;
00079
00080 wlist = (char **) rr_malloc((vocab_size+1)*sizeof(char *));
00081
00082 for (islot=0; islot<ht->nslots; islot++) {
00083 wordid = (int) ht->slots[islot].intval;
00084 if (wordid>0) wlist[wordid] = ht->slots[islot].string;
00085 }
00086
00087 for (wordid=1; wordid<=vocab_size; wordid++)
00088 if (wlist[wordid]==NULL)
00089 quit(-1,"%s ERROR: the hash table does not contain wordid %d\n",
00090 rname, wordid);
00091
00092 if (verbosity) fprintf(stderr,
00093 "%s: vocabulary was constructed from the vocab hash table\n",rname);
00094 *p_vocab = wlist;
00095 }