在新的看不见的文本上部署文本分类模型

问题描述

我正在研究文本分类问题。我已经附上了我训练过的文本分类模型的一个简单的虚拟片段。

如何在new_text上部署模型？在check_predictions上使用模型时，它会正确分类文本，但是，在使用新数据时，分类是错误的。

这是因为new_text需要被矢量化吗？我缺少基本的东西吗？

from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,precision_score,recall_score

df = pd.read_csv("/Users/veg.csv")
print (df)

X_train,X_test,y_train,y_test = train_test_split(df['Text'],df['Label'],random_state=1,test_size=0.2)
cv = CountVectorizer()

X_train_vectorized = cv.fit_transform(X_train)
X_test_vectorized = cv.transform(X_test)

naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_vectorized,y_train)
predictions = naive_bayes.predict(X_test_vectorized)

print("Accuracy score: ",accuracy_score(y_test,predictions))
print('accuracy %s' % accuracy_score(predictions,y_test))
print(classification_report(y_test,predictions))

check_predictions = []
for i in range(len(X_test)):   
    if predictions[i] == 0:
        check_predictions.append('vegetable')
    if predictions[i] == 1:
        check_predictions.append('fruit')
    if predictions[i] == 2:
        check_predictions.append('tree')
        
dummy_df = pd.DataFrame({'actual_label': list(y_test),'prediction': check_predictions,'Text':list(X_test)})
dummy_df.replace(to_replace=0,value='vegetable',inplace=True)
dummy_df.replace(to_replace=1,value='fruit',inplace=True)
dummy_df.replace(to_replace=2,value='tree',inplace=True)
print("DUMMY DF")
print(dummy_df.head(10))

new_data=['carrot','grapes','banana','potato','birch','carrot','birch']

new_predictions = []
for i in range(len(new_data)):    
    if predictions[i] == 0:
        new_predictions.append('vegetable')
    if predictions[i] == 1:
        new_predictions.append('fruit')
    if predictions[i] == 2:
        new_predictions.append('tree')
        
new_df = pd.DataFrame({'actual_label': list(y_test),'prediction': new_predictions,'Text':list(new_data)})        
new_df.replace(to_replace=0,inplace=True)
new_df.replace(to_replace=1,inplace=True)
new_df.replace(to_replace=2,inplace=True)
print("NEW DF")
print(new_df.head(10))

解决方法

无论要输入到模型中的任何（新）文本，都必须经过与训练数据完全相同的预处理步骤-这里的CountVectorizer已与X_train匹配：

new_data_vectorized = cv.transform(new_data) # NOT fit_transform
new_predictions = naive_bayes.predict(new_data_vectorized)

machine-learning nlp python scikit-learn text-classification