Main Page   Namespace List   Class Hierarchy   Alphabetical List   Compound List   File List   Namespace Members   Compound Members   File Members   Related Pages  

arabic_stem.cpp File Reference

#include "arabic_stemmer.h"
#include "WordSet.hpp"

Defines

#define STEM_TO_WORD   99
#define STEM_WORD   0
#define WAW   "0xe6"
#define CHAR_WAW   0xe6

Functions

void light_stem (char *, char *)
int remove_diacritics (char *, char *)
void remove_definite_articles (char *, char *)
char * substring (const char *, int, int)
void freeWordSets ()
void substring_copy (char dest[], const char *word, int start, int end)
int Str_equals (const char *s1, const char *s2)
int is_whitespace (const char c)
void load_static_files (const char *path)
void check_stemmer_files ()
void remove_all_suffixes (char *word, char *result, size_t lenlimit)
void arabic_clean_up (void)
void no_stem (char *word, char *result)
int on_stop_list (char *word)
void arabic_stop (char *word, char *result)
void arabic_norm2 (char *word, char *result)
void arabic_norm2_stop (char *word, char *result)
void arabic_light10 (char *word, char *result)
void arabic_light10_stop (char *word, char *result)
void show_stemmer_options ()
void * set_stemmer (char *stemval)
char * stem_phrase (char *phrase, int *numtoks, void(*stemmer)(char *, char *))

Variables

char * defarticles [] = {"ال", "وال","بال", "كال", "فال", "لل", "\0"}
char * suffixes [] = {"ها","ان","ات","ون","ين","يه","ية","ه","ة","ي","\0"}
stem_info_t stemtable [NUMSTEMMERS]
int files_loaded = 0
const int isWhitespace [256]
const int NormChar [256]
const int Norm3Char [256]
const int ArabicVowel [256]
WordSet * stop_words_ht
char * arabic_stemdir

Define Documentation

#define CHAR_WAW   0xe6
 

#define STEM_TO_WORD   99
 

#define STEM_WORD   0
 

#define WAW   "0xe6"
 


Function Documentation

void arabic_clean_up (void)
 

void arabic_light10 (char *word, char *result)
 

void arabic_light10_stop (char *word, char *result)
 

void arabic_norm2 (char *word, char *result)
 

void arabic_norm2_stop (char *word, char *result)
 

void arabic_stop (char *word, char *result)
 

void check_stemmer_files ()
 

void freeWordSets ()
 

int is_whitespace (const char c)
 

void light_stem (char *, char *)
 

void load_static_files (const char *path)
 

void no_stem (char *word, char *result)
 

int on_stop_list (char *word)
 

void remove_all_suffixes (char *word, char *result, size_t lenlimit)
 

void remove_definite_articles (char *, char *)
 

int remove_diacritics (char *, char *)
 

void* set_stemmer (char *stemval)
 

void show_stemmer_options ()
 

char* stem_phrase (char *phrase, int *numtoks, void(*stemmer)(char *, char *))
 

int Str_equals (const char *s1, const char *s2)
 

char * substring (const char *, int, int)
 

void substring_copy (char dest[], const char *word, int start, int end)
 


Variable Documentation

char* arabic_stemdir
 

const int ArabicVowel[256]
 

Initial value:

 {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,   0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0xe6,0,0,0,0,0,0xec,0xed,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}

char* defarticles[] = {"ال", "وال","بال", "كال", "فال", "لل", "\0"}
 

int files_loaded = 0
 

const int isWhitespace[256]
 

Initial value:

 {
0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}

const int Norm3Char[256]
 

Initial value:

 {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0x81,0,0,0,0,0,0,0,0,0x8a,0,0,0x8d,0x8e,0x8f,
0x90,0,0,0,0,0,0,0,0x98,0,0x9a,0,0,0,0,0x9f,
0,0,0,0,0,0,0,0,0,0,0xaa,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0xc0,0xc7,0xc7,0xc7,0xc7,0xc7,0xc7,0xc7,0xc8,0xe5,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,   0,0xd8,0xd9,0xda,0xdb,   0,0xdd,0xde,0xdf,
   0,0xe1,   0,0xe3,0xe4,0xe5,0xe6,   0,   0,   0,   0,   0,0xed, 0xed,   0,   0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}

const int NormChar[256]
 

Initial value:

 {
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0x81,0,0,0,0,0,0,0,0,0x8a,0,0,0x8d,0x8e,0x8f,
0x90,0,0,0,0,0,0,0,0x98,0,0x9a,0,0,0,0,0x9f,
0,0,0,0,0,0,0,0,0,0,0xaa,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0xc0,0xc1,0xc7,0xc7,0xc4,0xc7,0xc6,0xc7,0xc8,0xe5,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,   0,0xd8,0xd9,0xda,0xdb,   0,0xdd,0xde,0xdf,
   0,0xe1,   0,0xe3,0xe4,0xe5,0xe6,   0,   0,   0,   0,   0,0xed, 0xed,   0,   0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}

stem_info_t stemtable[NUMSTEMMERS]
 

Initial value:

 {
 {"none", "none", no_stem},
 {"arabic_stop", "arabic_stop", arabic_stop},
 {"arabic_norm2", "table normalization", arabic_norm2},
 {"arabic_norm2_stop", "table normalization with stopping", arabic_norm2_stop},
 {"arabic_light10", "light stemming", arabic_light10}, 
 {"arabic_light10_stop", "light10 and remove stop words", arabic_light10_stop}
 }

WordSet* stop_words_ht
 

char* suffixes[] = {"ها","ان","ات","ون","ين","يه","ية","ه","ة","ي","\0"}
 


Generated on Wed Nov 3 12:59:08 2004 for Lemur Toolkit by doxygen 1.2.18