问题描述
我有1000个.txt文件,计划在其中搜索若干关键字并计算它们的TF-IDF得分。但是出于某种原因,得到的结果大于1。我用2个.txt文件做了测试,内容分别是:“我正在学习nfc”和“您不需要AI”。对于nfc和AI,我预期TF-IDF应该为0.25,但是当我打开.csv时,它显示为1.4054651081081644。
我必须承认我的代码写法不是最高效的。我认为错误出在文件夹的处理上,因为我最初计划按年份检查文档(2000-2010年的年度报告)。但是我后来取消了这个计划,决定将所有年度报告作为一个整体进行检查。我认为针对文件夹的这种变通做法仍然是问题所在。我把这2个.txt文件放进了名为“-”的文件夹。有没有办法让它正确计算?
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from pathlib import Path
# Compute per-file TF-IDF scores and folder-level term statistics for a fixed
# keyword list. Results are appended to 'tf_idf.csv' (one row per file) and
# 'summary.csv' (one row per keyword and folder).

# root dir
root = '/Users/Tom/PycharmProjects/TextMining/'
# Keywords to look up; this spelling is what gets written to the CSV files.
words_to_find = ['AI', 'nfc']
# Both vectorizers lowercase the documents (scikit-learn default), so the
# vocabulary has to be lowercase too; otherwise an entry such as 'AI' can
# never match any token. Column order is the same as in words_to_find.
vocabulary = [word.lower() for word in words_to_find]
# tf_idf file writing: header is emitted once, file index runs across folders
wrote_tf_idf_header = False
tf_idf_file_idx = 0
# norm=None keeps the raw "term count * idf" products, i.e. rows are NOT
# scaled to unit length, so individual scores may be greater than 1.
vectorizer_tf_idf = TfidfVectorizer(
    max_df=.80,
    min_df=1,
    stop_words='english',
    use_idf=True,
    norm=None,
    vocabulary=vocabulary,
    ngram_range=(1, 3),
)
# NOTE(review): the original call was garbled ("stop_words='english',3))").
# vocabulary/ngram_range are restored to mirror the TF-IDF vectorizer, which
# the column indexing further down relies on -- confirm against the author's
# original script.
vectorizer_cnt = CountVectorizer(
    stop_words='english',
    vocabulary=vocabulary,
    ngram_range=(1, 3),
)
# A single pseudo "year" folder named '-' holds all reports.
years = ['-']
year_folders = [root + folder for folder in years]
# remove previous results files, because both CSVs are opened in append mode
for results_file in ('summary.csv', 'tf_idf.csv'):
    if os.path.isfile(results_file):
        os.remove(results_file)
# process every folder (for every year)
for year_idx, year_folder in enumerate(year_folders):
    # get file paths in folder (recursively); sorted so that the row order
    # in the CSV output is reproducible between runs
    file_paths = sorted(Path(year_folder).rglob("*.txt"))
    # count of files for each year
    file_cnt = len(file_paths)
    if file_cnt == 0:
        # nothing to vectorize for this folder
        print('No .txt files found in {}'.format(year_folder))
        continue
    # read every file's text as string and count whitespace-separated words
    docs_per_year = []
    words_in_folder = 0
    for txt_file in file_paths:
        with open(txt_file, encoding='utf-8', errors="replace") as f:
            txt_file_as_string = f.read()
        words_in_folder += len(txt_file_as_string.split())
        docs_per_year.append(txt_file_as_string)
    # Each file is one document: rows are files, columns follow words_to_find.
    tf_idf_documents_as_array = vectorizer_tf_idf.fit_transform(docs_per_year).toarray()
    cnt_documents_as_array = vectorizer_cnt.fit_transform(docs_per_year).toarray()
    # per-keyword summary for this folder
    with open('summary.csv', 'a') as f:
        f.write('Index;Term;Count;Df;Idf;Rel. Frequency\n')
        for idx, word in enumerate(words_to_find):
            term_counts = cnt_documents_as_array[:, idx]
            # total occurrences across all files in the folder
            abs_freq = term_counts.sum()
            # number of files that contain the term at least once
            doc_freq = np.count_nonzero(term_counts)
            # guard against folders whose files are all empty
            rel_freq = abs_freq / words_in_folder if words_in_folder else 0.0
            # value order matches the header: Count first, then Df
            f.write('{};{};{};{};{};{}\n'.format(
                idx + 1, word, abs_freq, doc_freq,
                vectorizer_tf_idf.idf_[idx], rel_freq))
        f.write('\n')
    # per-file TF-IDF scores
    with open('tf_idf.csv', 'a') as f:
        if not wrote_tf_idf_header:
            f.write('{}\n'.format(years[year_idx]))
            f.write('Index;Year;File;')
            for word in words_to_find:
                f.write('{};'.format(word))
            f.write('Sum\n')
            wrote_tf_idf_header = True
        for file_path, tf_idfs in zip(file_paths, tf_idf_documents_as_array):
            f.write('{};{};{};'.format(tf_idf_file_idx, years[year_idx], file_path.name))
            for score in tf_idfs:
                f.write('{};'.format(score))
            f.write('{}\n'.format(sum(tf_idfs)))
            tf_idf_file_idx += 1
    print()
解决方法
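我认为问题在于您把归一化参数设置成了 norm=None;按照文档(documentation)的说明,如果希望得分被归一化,该参数应设为 'l1' 或 'l2'。当 norm=None 时,输出的是未经归一化的“词频 × idf”;在默认的平滑设置下 idf = ln((1+n)/(1+df)) + 1,对于 n=2 个文档、df=1 的词,即 ln(1.5) + 1 ≈ 1.405,这正是您在.csv中看到的数值。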