问题描述
我现在正在研究机器学习,我想应用Logistic回归来处理数据。
dataSet链接:enter link description here
但是它报告这样的错误,我不确定如何使值匹配。
Traceback (most recent call last):
File "C:/Users/Minglang.Tuo20/PycharmProjects/clothes/Main.py",line 8,in <module>
class main():
File "C:/Users/Minglang.Tuo20/PycharmProjects/clothes/Main.py",line 28,in main
new_methods.Logistic_Regression()
File "C:\Users\Minglang.Tuo20\PycharmProjects\clothes\Meachine_Learning_Methods.py",line 19,in Logistic_Regression
self.file_category['Logistic Regression'] = lr.predict(self.X_train)
File "C:\Users\Minglang.Tuo20\PycharmProjects\clothes\venv\lib\site-packages\pandas\core\frame.py",line 3040,in __setitem__
self._set_item(key,value)
File "C:\Users\Minglang.Tuo20\PycharmProjects\clothes\venv\lib\site-packages\pandas\core\frame.py",line 3116,in _set_item
value = self._sanitize_column(key,line 3764,in _sanitize_column
value = sanitize_index(value,self.index)
File "C:\Users\Minglang.Tuo20\PycharmProjects\clothes\venv\lib\site-packages\pandas\core\internals\construction.py",line 748,in sanitize_index
"Length of values "
ValueError: Length of values (16492) does not match length of index (20615)
在这里,我打印原始的csv.file,它有20615行。
Review Text ... Sentiment
0 Absolutely wonderful - silky and sexy and comf... ... True
1 love this dress! it's sooo pretty. i happene... ... True
3 I love,love,love this jumpsuit. it's fun,fl... ... True
4 This shirt is very flattering to all due to th... ... True
5 I love tracy reese dresses,but this one is no... ... False
... ... ... ...
23478 I was surprised at the positive reviews for th... ... False
23479 So i wasn't sure about ordering this skirt bec... ... True
23480 ... True
23481 I was very happy to snag this dress at such a ... ... True
23485 This dress in a lovely platinum is feminine an... ... True
[20615 rows x 6 columns]
对于我的程序:
首先,我使用熊猫读取csv.file文件,并使用vectorizer分析原始文本。
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
class handle_data():
def __init__(self):
'''read the data from csv'''
self.file = pd.read_csv("DataSet.csv")
self.file_category = self.file[['Review Text','rating','Class Name','Age']]
self.file_category.head()
'''vectorizer and analyze'''
self.vectorizer = CountVectorizer()
self.analyze = self.vectorizer.build_analyzer()
'''train-data and test-data'''
self.X_train = None
self.Y_train = None
self.X_test = None
self.Y_test = None
def CleanData(self):
'''clean the data and classification of words'''
self.file_category['Review Text'] = self.file_category['Review Text'].fillna('')
self.file_category['Word Counts'] = self.file_category['Review Text'].apply(self.wordcounts)
self.file_category.head()
def wordcounts(self,string):
'''calculate the number of emotion(字段解析器)'''
c = {}
if self.analyze(string):
d = {}
words = self.vectorizer.fit_transform([string]).toarray()
vocabulary = self.vectorizer.vocabulary_
for key,value in vocabulary.items():
d[value] = key
#print("vocabulary: ")
#print(d)
for index,i in enumerate(words[0]):
c[d[index]] = i
#print("words[0]")
#print(c)
return c
然后基于这样的Sentiment_Classifier,根据“评分”,我们可以得出结论:口腔是否具有情感指数。
class sentiment_classifier():
def classification(self,file_category):
'''classify the satisfaction of customers'''
file_category = file_category[file_category['rating']!=3]
file_category['Sentiment'] = file_category['rating']>=4
file_category.head()
return file_category
接下来,我将数据分为训练数据和测试数据:
def train_test_data(self,file_category):
'''split the data to train and test'''
train_data,test_data = train_test_split(file_category,train_size=0.8,random_state=0)
X_train = self.vectorizer.fit_transform(train_data['Review Text'])
self.X_train = X_train
Y_train = train_data['Sentiment']
self.Y_train = Y_train
X_test = self.vectorizer.fit_transform(test_data['Review Text'])
self.X_test= X_test
Y_test = test_data["Sentiment"]
self.Y_test = Y_test
最后,我使用Logistic回归构建预测模型。 但是,当我的代码通过Logistic回归处理数据时,它将报告错误:
import datetime as dt
from sklearn.linear_model import LogisticRegression
class meachine_learning_methods():
'''The class contains 4 main methods to train the data,including Logistic_regression,Naive_Bayes,Support Vector Machine and Neural Network'''
def __init__(self,X_train,Y_train,file_category):
self.X_train = X_train
self.Y_train = Y_train
self.file_category = file_category.copy()
def Logistic_Regression(self):
'''The method of Logistic_Regression'''
start = dt.datetime.Now()
lr = LogisticRegression()
lr.fit(self.X_train,self.Y_train)
self.file_category['Logistic Regression'] = lr.predict(self.X_train)
print('Elapsed time: ',str(dt.datetime.Now() - start))
此外,main.class像这样:
from Handle_Data import handle_data
import warnings
from Meachine_Learning_Methods import meachine_learning_methods
from Sentiment_Classifier import sentiment_classifier
class main():
warnings.filterwarnings('ignore')
#Read the data
#Handle Data
new_file = handle_data()
new_file.CleanData()
#print(new_file.file_category)
#Classification the dataset
new_classifiers = sentiment_classifier()
new_file.file_category = new_classifiers.classification(new_file.file_category)
print(new_file.file_category)
#Train the dataSet
new_file.train_test_data(new_file.file_category)
new_methods = meachine_learning_methods(new_file.X_train,new_file.Y_train,new_file.file_category)
new_methods.Logistic_Regression()
解决方法
暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)