#include "arabic_stemmer.h"
#include "WordSet.hpp"
Defines | |
#define | STEM_TO_WORD 99 |
#define | STEM_WORD 0 |
#define | WAW "0xe6" |
#define | CHAR_WAW 0xe6 |
Functions | |
void | light_stem (char *, char *) |
int | remove_diacritics (char *, char *) |
void | remove_definite_articles (char *, char *) |
char * | substring (const char *, int, int) |
void | freeWordSets () |
void | substring_copy (char dest[], const char *word, int start, int end) |
int | Str_equals (const char *s1, const char *s2) |
int | is_whitespace (const char c) |
void | load_static_files (const char *path) |
void | check_stemmer_files () |
void | remove_all_suffixes (char *word, char *result, size_t lenlimit) |
void | arabic_clean_up (void) |
void | no_stem (char *word, char *result) |
int | on_stop_list (char *word) |
void | arabic_stop (char *word, char *result) |
void | arabic_norm2 (char *word, char *result) |
void | arabic_norm2_stop (char *word, char *result) |
void | arabic_light10 (char *word, char *result) |
void | arabic_light10_stop (char *word, char *result) |
void | show_stemmer_options () |
void * | set_stemmer (char *stemval) |
char * | stem_phrase (char *phrase, int *numtoks, void(*stemmer)(char *, char *)) |
Variables | |
char * | defarticles [] = {"ال", "وال","بال", "كال", "فال", "لل", "\0"} |
char * | suffixes [] = {"ها","ان","ات","ون","ين","يه","ية","ه","ة","ي","\0"} |
stem_info_t | stemtable [NUMSTEMMERS] |
int | files_loaded = 0 |
const int | isWhitespace [256] |
const int | NormChar [256] |
const int | Norm3Char [256] |
const int | ArabicVowel [256] |
WordSet * | stop_words_ht |
char * | arabic_stemdir |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Initial value of ArabicVowel (non-zero entries sit at their own byte index): { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0xe6,0,0,0,0,0,0xec,0xed,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
|
|
|
|
|
Initial value of isWhitespace (1 at byte values 9-13 and 28-32, 0 elsewhere): { 0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1, 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
|
Initial value (likely Norm3Char, judging by member order and the folding of all of 0xc1-0xc7 to 0xc7; confirm against the source file): { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0x81,0,0,0,0,0,0,0,0,0x8a,0,0,0x8d,0x8e,0x8f, 0x90,0,0,0,0,0,0,0,0x98,0,0x9a,0,0,0,0,0x9f, 0,0,0,0,0,0,0,0,0,0,0xaa,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0xc0,0xc7,0xc7,0xc7,0xc7,0xc7,0xc7,0xc7,0xc8,0xe5,0xca,0xcb,0xcc,0xcd,0xce,0xcf, 0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6, 0,0xd8,0xd9,0xda,0xdb, 0,0xdd,0xde,0xdf, 0,0xe1, 0,0xe3,0xe4,0xe5,0xe6, 0, 0, 0, 0, 0,0xed, 0xed, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
|
Initial value (likely NormChar, judging by member order and the folding of only 0xc2, 0xc3 and 0xc5 to 0xc7; confirm against the source file): { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0x81,0,0,0,0,0,0,0,0,0x8a,0,0,0x8d,0x8e,0x8f, 0x90,0,0,0,0,0,0,0,0x98,0,0x9a,0,0,0,0,0x9f, 0,0,0,0,0,0,0,0,0,0,0xaa,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0xc0,0xc1,0xc7,0xc7,0xc4,0xc7,0xc6,0xc7,0xc8,0xe5,0xca,0xcb,0xcc,0xcd,0xce,0xcf, 0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6, 0,0xd8,0xd9,0xda,0xdb, 0,0xdd,0xde,0xdf, 0,0xe1, 0,0xe3,0xe4,0xe5,0xe6, 0, 0, 0, 0, 0,0xed, 0xed, 0, 0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
|
Initial value of stemtable (each entry: stemmer name, description, stemmer function): { {"none", "none", no_stem}, {"arabic_stop", "arabic_stop", arabic_stop}, {"arabic_norm2", "table normalization", arabic_norm2}, {"arabic_norm2_stop", "table normalization with stopping", arabic_norm2_stop}, {"arabic_light10", "light stemming", arabic_light10}, {"arabic_light10_stop", "light10 and remove stop words", arabic_light10_stop} } |
|
|
|
|