问题描述
我是使用 scikit-learn 的新手,但我已经设法使用朴素贝叶斯(使用 .csv 文件作为训练文件)创建了用于预测情绪的代码,现在我想将经过训练的代码与新的 . csv 文件,其中包含要正确预测的新信息,但是当我尝试使用与第一个相同的步骤时(因为我必须准备它具有很好的标记和词干),我提出了以下错误:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-10-414670b5f455> in <module>
5
6 #Prediccion muestras
----> 7 r = clf.predict_proba(tfidf_x1)
8 data1['Prediccion'] = r
~\anaconda3\lib\site-packages\sklearn\naive_bayes.py in predict_proba(self,X)
116 order,as they appear in the attribute :term:`classes_`.
117 """
--> 118 return np.exp(self.predict_log_proba(X))
119
120
~\anaconda3\lib\site-packages\sklearn\naive_bayes.py in predict_log_proba(self,X)
96 check_is_fitted(self)
97 X = self._check_X(X)
---> 98 jll = self._joint_log_likelihood(X)
99 # normalize by P(x) = P(f_1,...,f_n)
100 log_prob_x = logsumexp(jll,axis=1)
~\anaconda3\lib\site-packages\sklearn\naive_bayes.py in _joint_log_likelihood(self,X)
775 def _joint_log_likelihood(self,X):
776 """Calculate the posterior log probability of the samples X"""
--> 777 return (safe_sparse_dot(X,self.feature_log_prob_.T) +
778 self.class_log_prior_)
779
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args,**kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k,arg in zip(sig.parameters,args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a,b,dense_output)
151 ret = np.dot(a,b)
152 else:
--> 153 ret = a @ b
154
155 if (sparse.issparse(a) and sparse.issparse(b)
~\anaconda3\lib\site-packages\scipy\sparse\base.py in __matmul__(self,other)
558 raise ValueError("Scalar operands are not allowed,"
559 "use '*' instead")
--> 560 return self.__mul__(other)
561
562 def __rmatmul__(self,other):
~\anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self,other)
514
515 if other.shape[0] != self.shape[1]:
--> 516 raise ValueError('dimension mismatch')
517
518 result = self._mul_multivector(np.asarray(other))
ValueError: dimension mismatch
这是我的代码,我正在尝试使用西班牙语作为基础来实现:
# -⁻- coding: UTF-8 -*-
import pandas as pd,numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from nltk.stem import Porterstemmer
data = pd.read_csv(r"C:\Users\Propietario\Desktop\machine learning comentarios\Naives Bayes muestra 2_ corregido.csv",encoding = 'utf-8',sep = ';',header = 0)
data = data.dropna()
def limpiar_tokenizar(texto):
# Se convierte todo el texto a minúsculas
nuevo_texto = texto.lower()
# Eliminación de signos de puntuación
regex = '[\\!\\"\\#\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\^_\\`\\{\\|\\}\\~]'
nuevo_texto = re.sub(regex,' ',nuevo_texto)
# Eliminación de números
nuevo_texto = re.sub("\d+",nuevo_texto)
# Eliminación de espacios en blanco múltiples
nuevo_texto = re.sub("\\s+",nuevo_texto)
# Tokenización por palabras individuales
nuevo_texto = nuevo_texto.split(sep = ' ')
# Eliminación de tokens con una longitud < 1
nuevo_texto = [token for token in nuevo_texto if len(token) > 1]
# Realizando stemming
stemming = Porterstemmer()
nuevo_texto = [stemming.stem(word) for word in nuevo_texto]
return(nuevo_texto)
datos_X = data.Comentarios
datos_y = data.Tipo
stop_words = list(stopwords.words('spanish'))
X_train,X_test,y_train,y_test = train_test_split(datos_X,datos_y,test_size = 0.3,random_state = 120)
tfidf_vectorizador = TfidfVectorizer(tokenizer = limpiar_tokenizar,min_df = 3,stop_words = stop_words)
tfidf_vectorizador.fit(X_train)
tfidf_vectorizador.fit(X_test)
tfidf_train = tfidf_vectorizador.transform(X_train)
tfidf_test = tfidf_vectorizador.transform(X_test)
clf = MultinomialNB(alpha = 0.6,fit_prior = False).fit(tfidf_train,y_train)
predicciones_test = clf.predict(X=tfidf_test)
直到这部分一切都是正确的,但当我介绍第二个数据时,问题就出现了。
data1 = pd.read_csv(r"C:\Users\Propietario\Desktop\machine learning comentarios\excels trabajados\Muestras\Muestra1.csv",header = 0)
data1 = data1.dropna()
x1 = data1['Comments']
tfidf_vectorizador.fit(x1)
tfidf_x1 = tfidf_vectorizador.transform(x1)
data1['Prediccion'] = clf.predict_proba(tfidf_x1)
data1.to_excel('Naive Bayes 5 Muestras.xlsx',sheet_name='Muestra1')
所以程序运行正常,直到出现错误信息“tfidf_x1”的预测。 我还必须使用其他步骤才能使其正常运行吗?还是我用错了功能?
我使用的是 python 3.7。
提前致谢。
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)