如何在Logistic回归中匹配值？

问题描述

我现在正在研究机器学习，我想应用Logistic回归来处理数据。

但是它报告这样的错误，我不确定如何使值匹配。

Traceback (most recent call last):
  File "C:/Users/Minglang.Tuo20/PycharmProjects/clothes/Main.py",line 8,in <module>
    class main():
  File "C:/Users/Minglang.Tuo20/PycharmProjects/clothes/Main.py",line 28,in main
    new_methods.Logistic_Regression()
  File "C:\Users\Minglang.Tuo20\PycharmProjects\clothes\Meachine_Learning_Methods.py",line 19,in Logistic_Regression
    self.file_category['Logistic Regression'] = lr.predict(self.X_train)
  File "C:\Users\Minglang.Tuo20\PycharmProjects\clothes\venv\lib\site-packages\pandas\core\frame.py",line 3040,in __setitem__
    self._set_item(key,value)
  File "C:\Users\Minglang.Tuo20\PycharmProjects\clothes\venv\lib\site-packages\pandas\core\frame.py",line 3116,in _set_item
    value = self._sanitize_column(key,line 3764,in _sanitize_column
    value = sanitize_index(value,self.index)
  File "C:\Users\Minglang.Tuo20\PycharmProjects\clothes\venv\lib\site-packages\pandas\core\internals\construction.py",line 748,in sanitize_index
    "Length of values "
ValueError: Length of values (16492) does not match length of index (20615)

在这里，我打印原始的csv.file，它有20615行。

                                             Review Text  ...  Sentiment
0      Absolutely wonderful - silky and sexy and comf...  ...       True
1      love this dress!  it's sooo pretty.  i happene...  ...       True
3      I love,love,love this jumpsuit. it's fun,fl...  ...       True
4      This shirt is very flattering to all due to th...  ...       True
5      I love tracy reese dresses,but this one is no...  ...      False
...                                                  ...  ...        ...
23478  I was surprised at the positive reviews for th...  ...      False
23479  So i wasn't sure about ordering this skirt bec...  ...       True
23480                                                     ...       True
23481  I was very happy to snag this dress at such a ...  ...       True
23485  This dress in a lovely platinum is feminine an...  ...       True

[20615 rows x 6 columns]

对于我的程序：

首先，我使用熊猫读取csv.file文件，并使用vectorizer分析原始文本。

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

class handle_data():

    def __init__(self):
        '''read the data from csv'''
        self.file = pd.read_csv("DataSet.csv")
        self.file_category = self.file[['Review Text','rating','Class Name','Age']]
        self.file_category.head()

        '''vectorizer and analyze'''
        self.vectorizer = CountVectorizer()
        self.analyze = self.vectorizer.build_analyzer()

        '''train-data and test-data'''
        self.X_train = None
        self.Y_train = None
        self.X_test = None
        self.Y_test = None






    def CleanData(self):
        '''clean the data and classification of words'''
        self.file_category['Review Text'] = self.file_category['Review Text'].fillna('')
        self.file_category['Word Counts'] = self.file_category['Review Text'].apply(self.wordcounts)
        self.file_category.head()



    def wordcounts(self,string):
        '''calculate the number of emotion(字段解析器）'''
        c = {}
        if self.analyze(string):
            d = {}
            words = self.vectorizer.fit_transform([string]).toarray()
            vocabulary = self.vectorizer.vocabulary_

            for key,value in vocabulary.items():
                d[value] = key
                #print("vocabulary: ")
                #print(d)

            for index,i in enumerate(words[0]):
                c[d[index]] = i
                #print("words[0]")
                #print(c)
        return c

然后基于这样的Sentiment_Classifier，根据“评分”，我们可以得出结论：口腔是否具有情感指数。

class sentiment_classifier():

    def classification(self,file_category):
        '''classify the satisfaction of customers'''
        file_category = file_category[file_category['rating']!=3]
        file_category['Sentiment'] = file_category['rating']>=4
        file_category.head()
        return file_category

接下来，我将数据分为训练数据和测试数据：

    def train_test_data(self,file_category):
        '''split the data to train and test'''
   

        train_data,test_data = train_test_split(file_category,train_size=0.8,random_state=0)

        X_train = self.vectorizer.fit_transform(train_data['Review Text'])
        self.X_train = X_train

        Y_train = train_data['Sentiment']
        self.Y_train = Y_train

        X_test = self.vectorizer.fit_transform(test_data['Review Text'])
        self.X_test= X_test

        Y_test = test_data["Sentiment"]
        self.Y_test = Y_test

最后，我使用Logistic回归构建预测模型。但是，当我的代码通过Logistic回归处理数据时，它将报告错误：

import datetime as dt
from sklearn.linear_model import LogisticRegression
class meachine_learning_methods():
    '''The class contains 4 main methods to train the data,including Logistic_regression,Naive_Bayes,Support Vector Machine and Neural Network'''
    def __init__(self,X_train,Y_train,file_category):
        self.X_train = X_train
        self.Y_train = Y_train
        self.file_category = file_category.copy()

    def Logistic_Regression(self):
        '''The method of Logistic_Regression'''
        start = dt.datetime.Now()
        lr = LogisticRegression()
        lr.fit(self.X_train,self.Y_train)
        self.file_category['Logistic Regression'] = lr.predict(self.X_train)
        print('Elapsed time: ',str(dt.datetime.Now() - start))

此外，main.class像这样：

from Handle_Data import handle_data
import warnings
from Meachine_Learning_Methods import meachine_learning_methods
from Sentiment_Classifier import sentiment_classifier


class main():
    warnings.filterwarnings('ignore')

    #Read the data
    #Handle Data
    new_file = handle_data()
    new_file.CleanData()
    #print(new_file.file_category)



    #Classification the dataset
    new_classifiers = sentiment_classifier()
    new_file.file_category = new_classifiers.classification(new_file.file_category)
    print(new_file.file_category)


    #Train the dataSet
    new_file.train_test_data(new_file.file_category)
    new_methods = meachine_learning_methods(new_file.X_train,new_file.Y_train,new_file.file_category)
    new_methods.Logistic_Regression()

请帮助我解决该错误，谢谢！

解决方法

暂无找到可以解决该程序问题的有效方法，小编努力寻找整理中！

如果你已经找到好的解决方法，欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@）