Problem description
I am trying to do LDA topic modeling with t-SNE and pyLDAvis for visualization. The LDA model itself runs fine, but the dominant-topics step that follows fails with "ValueError: too many values to unpack". The code and error are below. Any help is greatly appreciated.
LdaMulticore topic modeling code:
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel, LdaMulticore
import matplotlib.pyplot as plt

# NLTK stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from','subject','re','edu','use','not','would','say','could','_','be','know','good','go','get','do','done','try','many','some','nice','thank','think','see','rather','easy','easily','lot','lack','make','want','seem','run','need','even','right','line','also','may','take','come'])

%matplotlib inline
warnings.filterwarnings("ignore", category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Import dataset
#df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
#df = df.loc[df.target_names.isin(['soc.religion.christian','rec.sport.hockey','talk.politics.mideast','rec.motorcycles']), :]
df = pd.read_csv("/content/drive/My Drive/Negative_data.csv", encoding="ISO-8859-1")
print(df.shape)  #> (2361, 3)
df.head()

def Make_String(text):
    return str(text)

#Reviews.columns = ['Reviews']
#print(Reviews.head(10))
df['text'] = df['text'].apply(lambda x: Make_String(x))

# Create dictionary (data_ready holds the tokenized/lemmatized documents;
# the preprocessing step is omitted here)
id2word = corpora.Dictionary(data_ready)

# Create corpus: term-document frequency
corpus = [id2word.doc2bow(text) for text in data_ready]
lda_model = LdaMulticore(corpus, num_topics=10, id2word=id2word, passes=2, workers=2)
pprint(lda_model.print_topics())
#> [(0,
#>   '0.017*"write" + 0.015*"people" + 0.014*"organization" + 0.014*"article" + '
#>   '0.013*"time" + 0.008*"give" + 0.008*"first" + 0.007*"tell" + 0.007*"new" + '
#>   '0.007*"question"'),
#>  (1,
#>   '0.008*"christian" + 0.008*"believe" + 0.007*"god" + 0.007*"law" + '
#>   '0.006*"state" + 0.006*"israel" + 0.006*"israeli" + 0.005*"exist" + '
#>   '0.005*"way" + 0.004*"bible"'),
#>  (2,
#>   '0.024*"armenian" + 0.012*"bike" + 0.006*"kill" + 0.006*"work" + '
#>   '0.005*"well" + 0.005*"year" + 0.005*"sumgait" + 0.005*"soldier" + '
#>   '0.004*"way" + 0.004*"ride"'),
#>  (3,
#>   '0.019*"team" + 0.019*"game" + 0.013*"hockey" + 0.010*"player" + '
#>   '0.009*"play" + 0.009*"win" + 0.009*"nhl" + 0.009*"year" + 0.009*"hawk" + '
#>   '0.009*"season"')]
Output:
[(0,
  '0.340*"seriously" + 0.017*"time" + 0.015*"samsung" + 0.014*"day" + '
  '0.013*"phone" + 0.012*"order" + 0.012*"wait" + 0.011*"week" + 0.011*"damn" '
  '+ 0.011*"next"'),
 (1,
  '0.081*"puma" + 0.068*"shoe" + 0.046*"adida" + 0.017*"site" + 0.017*"como" + '
  '0.014*"wear" + 0.014*"ugly" + 0.011*"shirt" + 0.010*"era" + 0.009*"pumas"'),
 (2,
  '0.033*"watch" + 0.021*"hate" + 0.021*"wear" + 0.020*"shit" + 0.020*"buy" + '
  '0.016*"game" + 0.014*"man" + 0.014*"stop" + 0.014*"time" + 0.013*"still"'),
 (3,
  '0.037*"bad" + 0.014*"year" + 0.013*"pay" + 0.013*"feel" + 0.011*"thing" + '
  '0.011*"really" + 0.011*"last" + 0.011*"ever" + 0.009*"never" + '
  '0.009*"people"'),
 (4,
  '0.332*"com" + 0.173*"twitter" + 0.078*"pic" + 0.036*"status" + '
  '0.036*"https" + 0.029*"nintendo" + 0.015*"apple" + 0.008*"pue" + '
  '0.006*"photo" + 0.004*"iphone"'),
 (5,
  '0.162*"http" + 0.028*"pace" + 0.027*"low" + 0.019*"new" + 0.019*"price" + '
  '0.017*"crushed_km" + 0.017*"size" + 0.014*"video" + 0.012*"sale" + '
  '0.012*"dlvr"'),
 (6,
  '0.062*"nike" + 0.019*"phone" + 0.019*"drop" + 0.018*"work" + 0.013*"tell" + '
  '0.013*"hard" + 0.012*"call" + 0.011*"crazy" + 0.011*"lol" + 0.010*"ass"'),
 (7,
  '0.036*"sin" + 0.036*"die" + 0.024*"kill" + 0.018*"pero" + 0.012*"android" + '
  '0.012*"pro" + 0.009*"death" + 0.008*"igual" + 0.008*"final" + '
  '0.008*"problem"'),
 (8,
  '0.039*"black" + 0.036*"http" + 0.034*"netflix" + 0.020*"fire" + '
  '0.018*"dead" + 0.014*"son" + 0.013*"lose" + 0.011*"tv" + 0.011*"tinyurl" + '
  '0.010*"steal"'),
 (9,
  '0.299*"live" + 0.295*"alone" + 0.038*"seriously" + 0.013*"switch" + '
  '0.008*"mad" + 0.006*"screen" + 0.006*"wrong" + 0.006*"season" + '
  '0.005*"hour" + 0.005*"people"')]
Dominant topics code:
# Sentence coloring of N sentences
def topics_per_document(model, corpus, start=0, end=1):
    corpus_sel = corpus[start:end]
    dominant_topics = []
    topic_percentages = []
    for i, corp in enumerate(corpus_sel):
        topic_percs, wordid_topics, wordid_phivalues = model[corp]
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return (dominant_topics, topic_percentages)

dominant_topics, topic_percentages = topics_per_document(model=lda_model, corpus=corpus, end=-1)

# Distribution of dominant topics in each document
df = pd.DataFrame(dominant_topics, columns=['Document_Id', 'Dominant_Topic'])
dominant_topic_in_each_doc = df.groupby('Dominant_Topic').size()
df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(name='count').reset_index()

# Total topic distribution by actual weight
topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
df_topic_weightage_by_doc = topic_weightage_by_doc.sum().to_frame(name='count').reset_index()

# Top 3 keywords for each topic
topic_top3words = [(i, topic)
                   for i, topics in lda_model.show_topics(formatted=False)
                   for j, (topic, wt) in enumerate(topics) if j < 3]
df_top3words_stacked = pd.DataFrame(topic_top3words, columns=['topic_id', 'words'])
df_top3words = df_top3words_stacked.groupby('topic_id').agg(',\n'.join)
df_top3words.reset_index(level=0, inplace=True)
Error:
<ipython-input-13-5ea2ada44643> in topics_per_document(model, corpus, start, end)
      5     topic_percentages = []
      6     for i, corp in enumerate(corpus_sel):
----> 7         topic_percs, wordid_topics, wordid_phivalues = model[corp]
      8         dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
      9         dominant_topics.append((i, dominant_topic))

ValueError: too many values to unpack (expected 3)
Many thanks in advance.
Solution
model[corp] does not return the tuple (topic_percs, wordid_topics, wordid_phivalues) that your code expects. Instead, it returns the membership vector of corp, i.e. for each topic in the model, the probability that that topic generated corp. Here corp is a single document coming from enumerate(corpus[0:1]) as you iterate, so you get one membership vector per document in corpus. Since your model has 10 topics, that vector can contain up to 10 pairs, which cannot be unpacked into three names, hence "ValueError: too many values to unpack (expected 3)".
You can see this from the example given in the documentation (shown for LdaModel, the parent class of LdaMulticore, but both return the same kind of object):
>>> from gensim.test.utils import common_corpus
>>> from gensim.models.ldamodel import LdaModel
>>> lda = LdaModel(common_corpus, num_topics=10, iterations=1)
>>> doc_bow = [(1, 0.3), (2, 0.1), (0, 0.09)]
>>> doc_lda = lda[doc_bow]
>>> doc_lda
[(0, 0.08579318), (1, 0.0858944), (2, 0.079572774), (3, 0.09752562),
 (4, 0.08426655), (5, 0.1231114), (6, 0.17063272), (7, 0.08766636),
 (8, 0.083353266), (9, 0.102183744)]
What you seem to want to call is

model.get_document_topics(corp, per_word_topics=True)

for each bag-of-words document in the corpus (what you call corp). With per_word_topics=True, this returns a three-tuple: the topic distribution for the whole document, the most likely topic for each word, and the phi relevance values of each word-topic combination, multiplied by the feature length (i.e. the word count).
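For illustration, here is a minimal sketch of your loop rewritten around that call (reusing the lda_model and corpus from your question; the rest of the function is unchanged):

def topics_per_document(model, corpus, start=0, end=1):
    dominant_topics, topic_percentages = [], []
    for i, corp in enumerate(corpus[start:end]):
        # per_word_topics=True yields the three-tuple the original
        # unpacking expected: (topic dist, per-word topics, phi values)
        topic_percs, wordid_topics, wordid_phivalues = model.get_document_topics(
            corp, per_word_topics=True)
        dominant_topic = sorted(topic_percs, key=lambda x: x[1], reverse=True)[0][0]
        dominant_topics.append((i, dominant_topic))
        topic_percentages.append(topic_percs)
    return dominant_topics, topic_percentages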
Otherwise, you can change

topic_percs, wordid_topics, wordid_phivalues = model[corp]

to

topic_percs = model[corp]

or, even more clearly,

topic_percs = model.get_document_topics(corp)
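A further option, sketched here as a suggestion rather than something your code requires: LdaMulticore also accepts per_word_topics at construction time, in which case model[corp] itself returns the three-tuple:

# With per_word_topics=True set on the model, model[corp] returns
# (topic_percs, wordid_topics, wordid_phivalues) directly.
lda_model = LdaMulticore(corpus, num_topics=10, id2word=id2word,
                         passes=2, workers=2, per_word_topics=True)
topic_percs, wordid_topics, wordid_phivalues = lda_model[corpus[0]]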
If what you meant by wordid_topics is the probability of each word id within each topic, then you can instead call model.get_topic_terms(topicid), which returns the (word id, probability) pairs for the most relevant words generated by a topic, or model.get_topics() to get the whole term-topic matrix.
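A short sketch of those two calls, assuming the lda_model and id2word from your question:

# (word_id, probability) pairs for the 5 most relevant words of topic 0
for word_id, prob in lda_model.get_topic_terms(0, topn=5):
    print(id2word[word_id], prob)

# Term-topic matrix with shape (num_topics, vocabulary_size); each row
# is one topic's probability distribution over the whole vocabulary.
topic_term_matrix = lda_model.get_topics()
print(topic_term_matrix.shape)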