问题描述
当前,我的脚本把数据按单个单词切分(data_words),但这样会漏掉由两个单词组成的关键短语(例如 "Cloud Storage")。
我发现 CountVectorizer(ngram_range=(1, 2)) 可以正确地提取这类双词短语,但是我不知道该如何把它整合到现有脚本中。
请参考现有脚本第10行(data_words),我需要在该处引入 CountVectorizer 产生的 n-gram 特征。
我按照 "https://www.listendata.com/2018/05/sentiment-analysis-using-python.html" 这篇教程写出了以下脚本。
下面是我的示例数据(原始数据&Sentence_data&Taxonomy = Df_tx)和语法。
原始文字
用于将原始数据转换为 sentence_data 的脚本
# Split each document into sentences and stack them one below the other
# for sentence-level categorization and sentiment mapping.
# Reads: df with 'slno' in column 0 and the text in column 1.
# Produces: sentence_data with columns ['slno', 'text'], empty rows dropped.
sentence_data = pd.DataFrame(columns=['slno', 'text'])
rows = []  # collect plain dicts and concat once -- repeated pd.concat in a loop is O(n^2)
for d in range(len(df)):
    # Split on '!', ',' or '.'; raw string avoids the invalid '\.' escape
    # (a SyntaxWarning in modern Python).
    doc = re.split(r'!|,|\.', str(df.iloc[d, 1]))
    for s in doc:
        rows.append({'slno': df['slno'][d], 'text': s})
sentence_data = pd.concat([sentence_data, pd.DataFrame(rows)], ignore_index=True)
sentence_data.head(5)
# Drop empty text rows (artifacts of splitting) before export.
sentence_data['text'].replace('', np.nan, inplace=True)
sentence_data.dropna(subset=['text'], inplace=True)
sentence_data.head(15)
新脚本
from sklearn.feature_extraction.text import CountVectorizer

# Build unigram + bigram counts so two-word phrases such as "cloud storage"
# become features alongside single words.
# min_df=1 (was 0): equivalent for retained terms, and integer min_df must be
# >= 1 under modern scikit-learn parameter validation.
count_vect = CountVectorizer(stop_words="english", analyzer='word',
                             ngram_range=(1, 2), min_df=1, max_features=5000)
final_bigram_counts = count_vect.fit_transform(sentence_data['text'])
# get_feature_names() was removed in scikit-learn 1.2; use the _out variant.
features = count_vect.get_feature_names_out()
输出:['访问更长','aldo','aldo excel','适用性']
现有脚本:
# Map each sentence to taxonomy subtopics (df_tx) by keyword matching.
# A subtopic is assigned when at least one PrimaryKeyword matches, any
# required AdditionalKeyword matches, and no ExcludeKeyword matches.
# Helpers remov_punct / remov_quote / asterix_handler are defined elsewhere;
# asterix_handler presumably does wildcard ('*') matching -- TODO confirm.
data = sentence_data
cat2list = list(set(df_tx['Subtopic']))
data['Category'] = 0
mapped_data = pd.DataFrame(columns=['slno', 'text', 'Category'])
temp = pd.DataFrame()
for k in range(len(data)):
    comment = remov_punct(data.iloc[k, 1])
    print(comment)
    print(k)
    # Tokenize into lowercase unigrams, then append adjacent-word bigrams so
    # two-word key phrases such as "cloud storage" can also be matched
    # against the taxonomy keywords (the asker's requested n-gram support).
    data_words = [str(x.strip()).lower() for x in str(comment).split()]
    data_words = data_words + [' '.join(p) for p in zip(data_words, data_words[1:])]
    print(data_words[1:5])
    output = []
    for l in range(len(df_tx)):
        key_flag = False
        and_flag = False
        not_flag = False
        # NOTE(review): the original never reset kw_clean when
        # PrimaryKeywords was NaN, so the value from the previous taxonomy
        # row leaked into this one; reset it explicitly in every branch.
        if str(df_tx['PrimaryKeywords'][l]) != 'nan':
            kw_clean = remov_quote(df_tx['PrimaryKeywords'][l])
        else:
            kw_clean = df_tx['PrimaryKeywords'][l]
        if str(df_tx['AdditionalKeywords'][l]) != 'nan':
            aw_clean = remov_quote(df_tx['AdditionalKeywords'][l])
        else:
            aw_clean = df_tx['AdditionalKeywords'][l]
        if str(df_tx['ExcludeKeywords'][l]) != 'nan':
            nw_clean = remov_quote(df_tx['ExcludeKeywords'][l])
        else:
            nw_clean = df_tx['ExcludeKeywords'][l]
        # Comma-separated keyword lists -> lowercase sets for O(1) matching.
        key_words2 = set()
        and_words2 = set()
        not_words2 = set()
        if str(kw_clean) != 'nan':
            key_words2 = {str(x.strip()).lower() for x in kw_clean.split(',')}
        if str(aw_clean) != 'nan':
            and_words2 = {str(x.strip()).lower() for x in aw_clean.split(',')}
        if str(nw_clean) != 'nan':
            not_words2 = {str(x.strip()).lower() for x in nw_clean.split(',')}
        # Primary keywords: at least one exact or wildcard match required.
        if str(kw_clean) != 'nan':
            if set(data_words) & key_words2:
                key_flag = True
            elif asterix_handler(key_words2, data_words) == 'T':
                key_flag = True
        # Additional keywords: only constrain the match when present.
        if str(aw_clean) == 'nan':
            and_flag = True
        elif set(data_words) & and_words2:
            and_flag = True
        elif asterix_handler(and_words2, data_words) == 'T':
            and_flag = True
        # Exclude keywords: any match vetoes the category.
        if str(nw_clean) != 'nan':
            if set(data_words) & not_words2:
                not_flag = True
            elif asterix_handler(not_words2, data_words) == 'T':
                not_flag = True
        if key_flag and and_flag and not not_flag:
            output.append(str(df_tx['Subtopic'][l]))
            temp = {'slno': [data.iloc[k, 0]],
                    'text': [data.iloc[k, 1]],
                    'Category': [df_tx['Subtopic'][l]]}
            mapped_data = pd.concat([mapped_data, pd.DataFrame(temp)])
# Write once after the loop instead of rewriting the CSV per iteration.
mapped_data.to_csv("mapped_data.csv", index=True)
mapped_data.head(2)
输出:['like','the','help','option'] ['my','excel','workbook','was']
谢谢
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)