如何删除阿拉伯语中的停用词?

问题描述

我有一个训练文件和一个测试文件,想用机器学习算法从推文中检测情感。这段代码对阿拉伯语训练数据集做预处理,但在删除停用词(stop words)时出现了下面的错误。请问需要自己准备阿拉伯语停用词文件,还是可以直接从 NLTK 导入?

# Training set: labeled Arabic tweets (hard-coded local Windows path --
# adjust for your own machine).
df=pd.read_csv("C:/Users/User/Desktop/2018-EI-oc-Ar-fear-train.csv")

# Test set with gold labels, same schema as the training file.
df_test=pd.read_csv("C:/Users/User/Desktop/2018-EI-oc-Ar-fear-test-gold.csv")

def stopWordRmove(text):
    """Remove Arabic stop words from *text* and return the filtered sentence.

    The stop-word list is read from 'ar_stop_word_list.txt' (one word per
    line) in the current working directory -- the reported
    FileNotFoundError means that file must be created/placed there, or the
    NLTK Arabic list (nltk.corpus.stopwords.words('arabic')) used instead.
    """
    # `with` guarantees the file handle is closed (the original leaked it);
    # explicit utf-8 so the Arabic words decode the same on every platform.
    with open("ar_stop_word_list.txt", "r", encoding="utf-8") as ar_stop_list:
        # A set gives O(1) membership tests instead of O(n) per token.
        stop_words = set(ar_stop_list.read().split('\n'))
    # word_tokenize presumably comes from nltk.tokenize (import not shown
    # in this chunk) -- verify against the file header.
    needed_words = [w for w in word_tokenize(text) if w not in stop_words]
    return " ".join(needed_words)

def noramlize(Tweet):
    """Normalize an Arabic tweet: unify letter variants, strip non-Arabic
    characters, and remove diacritics/tatweel.

    (Function name keeps the original 'noramlize' spelling so existing
    callers are unaffected.)
    """
    Tweet = re.sub(r"[إأٱآا]", "ا", Tweet)  # unify all alef variants
    Tweet = re.sub(r"ى", "ي", Tweet)        # alef maqsura -> ya
    Tweet = re.sub(r"ؤ", "ء", Tweet)        # waw + hamza -> bare hamza
    # BUG FIX: the original call was re.sub(r"ئ", Tweet) -- the replacement
    # argument "ء" was missing, so this line raised a TypeError.
    Tweet = re.sub(r"ئ", "ء", Tweet)        # ya + hamza -> bare hamza
    # Drop everything outside the basic Arabic letter range and spaces.
    Tweet = re.sub(r'[^ا-ي ]', "", Tweet)

    # NOTE(review): the harakat below (U+064B..U+0652) fall outside the
    # ا-ي range, so the previous line already removed them; kept for
    # clarity/robustness if the character filter is ever relaxed.
    noise = re.compile(""" ّ    | # Tashdid
                         َ    | # Fatha
                         ً    | # Tanwin Fath
                         ُ    | # damma
                         ٌ    | # Tanwin damm
                         ِ    | # Kasra
                         ٍ    | # Tanwin Kasr
                         ْ    | # Sukun
                         ـ     # Tatwil/Kashida
                     """, re.VERBOSE)
    Tweet = re.sub(noise, '', Tweet)
    return Tweet

def stopWordRmove(Tweet):
    """Remove Arabic stop words from *Tweet* and return the filtered sentence.

    NOTE(review): this is a duplicate of the stopWordRmove defined earlier
    in the file; this later definition shadows the first. One of the two
    should be deleted.
    """
    # `with` guarantees the file handle is closed (the original leaked it);
    # explicit utf-8 so the Arabic words decode the same on every platform.
    with open("ar_stop_word_list.txt", "r", encoding="utf-8") as ar_stop_list:
        # A set gives O(1) membership tests instead of O(n) per token.
        stop_words = set(ar_stop_list.read().split('\n'))
    needed_words = [w for w in word_tokenize(Tweet) if w not in stop_words]
    return " ".join(needed_words)

def stemming(Tweet):
    """Return *Tweet* with every token reduced to its ISRI Arabic stem."""
    # NOTE(review): NLTK spells this class ISRIStemmer (capital S); this
    # name only resolves if the file's import aliases it -- verify.
    stemmer = ISRIstemmer()
    stemmed = (stemmer.stem(token) for token in word_tokenize(Tweet))
    return " ".join(stemmed)

def prepareDataSets(df):
    """Preprocess every tweet in *df* and return a new DataFrame with
    columns ['Tweet', 'Affect Dimension'].

    Pipeline per row: stop-word removal -> normalization -> ISRI stemming.
    """
    sentences = []
    for _, row in df.iterrows():
        # BUG FIX: the original re-read row['Tweet'] at every step (so only
        # stemming's output survived) and never appended to `sentences`,
        # which made the returned DataFrame always empty.
        text = stopWordRmove(row['Tweet'])
        text = noramlize(text)
        text = stemming(text)
        sentences.append([text, row['Affect Dimension']])

    df_sentences = DataFrame(sentences, columns=['Tweet', 'Affect Dimension'])
    return df_sentences

preprocessed_df = prepareDataSets(df)
FileNotFoundError: [Errno 2] No such file or directory: 'ar_stop_word_list.txt'

如何从阿拉伯语推文中删除停用词?

解决方法

暂未找到可以解决该问题的有效方法,小编正在努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)