Problem Description
I have a training file and a test file, and I want to use a machine-learning algorithm to detect emotion in tweets. In this code I apply preprocessing steps to an Arabic training dataset, and I get the error below when removing stop words. Do I need to install an Arabic stop-word file, or can I import one from NLTK?
# imports required by the snippets below
import re
import pandas as pd
from pandas import DataFrame
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer

# csv file for train
df = pd.read_csv("C:/Users/User/Desktop/2018-EI-oc-Ar-fear-train.csv")
# csv file for test
df_test = pd.read_csv("C:/Users/User/Desktop/2018-EI-oc-Ar-fear-test-gold.csv")

def stopWordRmove(text):
    ar_stop_list = open("ar_stop_word_list.txt", "r")  # the error appears on this line
    stop_words = ar_stop_list.read().split('\n')
    needed_words = []
    words = word_tokenize(text)
    for w in words:
        if w not in stop_words:
            needed_words.append(w)
    filtered_sentence = " ".join(needed_words)
    return filtered_sentence
def noramlize(Tweet):
    # normalize alef/yaa/hamza variants to a single form
    Tweet = re.sub(r"[إأٱآا]", "ا", Tweet)
    Tweet = re.sub(r"ى", "ي", Tweet)
    Tweet = re.sub(r"ؤ", "ء", Tweet)
    Tweet = re.sub(r"ئ", "ء", Tweet)
    Tweet = re.sub(r'[^ا-ي ]', "", Tweet)
    noise = re.compile(""" ّ    | # Tashdid
                           َ    | # Fatha
                           ً    | # Tanwin Fath
                           ُ    | # Damma
                           ٌ    | # Tanwin Damm
                           ِ    | # Kasra
                           ٍ    | # Tanwin Kasr
                           ْ    | # Sukun
                           ـ     # Tatwil/Kashida
                       """, re.VERBOSE)
    Tweet = re.sub(noise, '', Tweet)
    return Tweet
def stemming(Tweet):
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(Tweet)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_sentence = " ".join(stemmed_words)
    return stemmed_sentence
def prepareDataSets(df):
    sentences = []
    for index, r in df.iterrows():
        # chain the preprocessing steps instead of re-reading the raw tweet each time
        text = stopWordRmove(r['Tweet'])
        text = noramlize(text)
        text = stemming(text)
        sentences.append([text, r['Affect Dimension']])
    df_sentences = DataFrame(sentences, columns=['Tweet', 'Affect Dimension'])
    return df_sentences
preprocessed_df = prepareDataSets(df)
FileNotFoundError: [Errno 2] No such file or directory: 'ar_stop_word_list.txt'
How can I remove stop words from Arabic tweets?
Solution
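The FileNotFoundError means Python cannot find ar_stop_word_list.txt in the current working directory. You do not actually need that external file: NLTK ships an Arabic stop-word list in its stopwords corpus. Below is a minimal sketch, assuming the stopwords and punkt corpora have been downloaded once; names like arabic_stops are illustrative.

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')  # one-time download of NLTK's stop-word lists
nltk.download('punkt')      # tokenizer models used by word_tokenize

arabic_stops = set(stopwords.words('arabic'))  # built-in Arabic list

def stopWordRmove(text):
    # keep only tokens that are not Arabic stop words
    words = word_tokenize(text)
    needed_words = [w for w in words if w not in arabic_stops]
    return " ".join(needed_words)

If the built-in list is too small for your data, you can union it with any custom word list of your own.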
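Alternatively, if you want to keep using your own ar_stop_word_list.txt, the error goes away once the file is opened by a path Python can resolve. A hedged sketch, assuming the file sits next to the script (script_dir and stop_path are illustrative names; __file__ is only set when the code runs as a script, so adjust the location to wherever you saved the file):

import os

script_dir = os.path.dirname(os.path.abspath(__file__))  # folder containing this script
stop_path = os.path.join(script_dir, "ar_stop_word_list.txt")

with open(stop_path, "r", encoding="utf-8") as f:  # explicit encoding matters for Arabic text
    stop_words = set(f.read().split('\n'))

Using a set here also makes the `w not in stop_words` membership test faster than searching a list.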