问题描述
我正在编写一个用于情感分析的 Python 脚本:先对文本进行预处理,对分类特征进行向量化处理并拆分数据集,然后使用 LogisticRegression 模型,获得了 84% 的准确率。
但当我上传新的数据集并尝试部署已训练好的模型时,只获得了 51.84% 的准确率。
代码:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
# Python names are case-sensitive: NLTK exports the stemmer as `PorterStemmer`.
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
# scikit-learn exports the grid-search class as `GridSearchCV`.
from sklearn.model_selection import GridSearchCV
import joblib

# Aliases for the misspelled names this script used before, so any remaining
# reference to the old spelling keeps resolving to the real class.
Porterstemmer = PorterStemmer
gridsearchcv = GridSearchCV

# English stop words from NLTK; the corpus must already be available locally
# (e.g. via `nltk.download('stopwords')`).
stop_words = set(stopwords.words('english'))
def load_dataset(filename, cols):
    """Read a CSV file into a DataFrame and relabel its columns.

    Args:
        filename: Path of the CSV file; it is decoded as latin-1.
        cols: Column labels to assign, one per column in the file.

    Returns:
        The loaded ``pandas.DataFrame`` whose columns are ``cols``.

    Raises:
        ValueError: Raised by pandas when ``len(cols)`` does not match the
            number of columns read from the file.
    """
    # NOTE(review): with pandas' default `header` handling the first row of the
    # file is consumed as a header before being replaced by `cols` -- confirm
    # the input files really start with a header row.
    dataset = pd.read_csv(filename, encoding='latin-1')
    dataset.columns = cols
    return dataset
# Raw string literal: in a normal string "\A", "\s" and "\i" are invalid escape
# sequences (a warning today, an error in future Python versions). The path
# value itself is unchanged.
dataset = load_dataset(
    r"F:\AIenv\sentiment_analysis\input_2_balanced.csv",
    ["id", "label", "date", "text"],
)
dataset.head()
# `processtweet` is not defined in this snippet and must be defined before this
# line runs. Whatever cleaning it performs has to be applied, unchanged, to any
# new data the trained model is later asked to score.
dataset['clean_text'] = dataset['text'].apply(processtweet)
# create doc2vec vector columns
from gensim.test.utils import common_texts
# The gensim class is spelled `Doc2Vec`; a lowercase `doc2vec` import fails.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# One TaggedDocument per row: whitespace-split tokens, tagged with the row position.
documents = [
    TaggedDocument(doc, [i])
    for i, doc in enumerate(dataset["clean_text"].apply(lambda x: x.split(" ")))
]
# train a Doc2Vec model with our text data
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
# transform each document into a vector (one DataFrame column per dimension)
doc2vec_df = dataset["clean_text"].apply(lambda x: model.infer_vector(x.split(" "))).apply(pd.Series)
doc2vec_df.columns = ["doc2vec_vector_" + str(x) for x in doc2vec_df.columns]
# NOTE: the classifier pipeline further down is fitted on a single text column,
# so these extra columns are not consumed by it.
dataset = pd.concat([dataset, doc2vec_df], axis=1)
# add tf-idf columns
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore terms that occur in fewer than 10 documents.
tfidf = TfidfVectorizer(min_df=10)
# NOTE: `.toarray()` materialises a dense matrix; memory grows with rows x vocabulary.
tfidf_result = tfidf.fit_transform(dataset["clean_text"]).toarray()
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back only on installs that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
tfidf_df = pd.DataFrame(
    tfidf_result,
    columns=["word_" + str(x) for x in feature_names],
    index=dataset.index,  # align with `dataset` so the concat below is row-wise
)
dataset = pd.concat([dataset, tfidf_df], axis=1)
# Train on the *cleaned* text. The previous positional selection
# (`iloc[:, 3]`) picked the raw "text" column, while predictions on new data
# are made from "clean_text" -- the model was therefore fitted and applied on
# differently pre-processed inputs.
x = dataset["clean_text"]
# "label" is the column previously selected positionally with `iloc[:, 1]`.
y = dataset["label"]
# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# create pipeline: token counts -> tf-idf weighting -> logistic regression
pipeline = Pipeline([
    # `stop_words='english'` selects scikit-learn's built-in English list.
    # The former `['english']` was a one-word custom list that removed only
    # the literal token "english".
    ('bow', CountVectorizer(strip_accents='ascii', stop_words='english', lowercase=True)),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression(C=15.075475376884423, penalty="l2")),
])
# Parameter grid: only the vectoriser/tf-idf steps are tuned here; the
# classifier's hyper-parameters stay fixed at the values given above.
parameters = {
    'bow__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
}
# 10-fold cross-validated search over the grid, using all available cores.
grid = GridSearchCV(pipeline, cv=10, param_grid=parameters, verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# Score the held-out split with the fitted grid search.
y_preds = grid.predict(X_test)

print("accuracy score: ", accuracy_score(y_test, y_preds))
print("\n")

cm = confusion_matrix(y_test, y_preds)
print("confusion matrix: \n", cm)
print("\n")

print(classification_report(y_test, y_preds))

# Persist the whole fitted search object, then read it back the way a
# deployment would.
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
joblib.dump(grid, "F:\\AIenv\\sentiment_analysis\\RF_jupyter.pkl")
RF_Model = joblib.load("F:\\AIenv\\sentiment_analysis\\RF_jupyter.pkl")

# `test_twtr` is not defined in this snippet; presumably it is the new dataset
# with a "clean_text" column -- verify that the column was produced by the same
# cleaning step as the training data.
test_twtr_preds = RF_Model.predict(test_twtr["clean_text"])
解决方法
我对不同分类器在情感分析中的表现做过调查研究。针对某个特定的 Twitter 数据集,我曾经尝试过逻辑回归(Logistic Regression)、朴素贝叶斯、支持向量机(SVM)、k 最近邻(KNN)和决策树等模型。对所选数据集的观察表明:逻辑回归和朴素贝叶斯在所有类型的测试中均具有良好的准确率,其次是 SVM,然后是决策树,而 KNN 的准确率最低。因此,逻辑回归和朴素贝叶斯模型在情感分析和预测方面表现最好。情感分类器(准确率 %,RMSE):LR(78.3541,1.053619)、NB(76.764706,1.064738)、SVM(73.5835,1.074752)、DT(69.2941,1.145234)、KNN(62.9476,1.376589)。
在这些情况下,特征提取非常关键。
# 这可能会对您有所帮助。
# 导入所需的库
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import time
# Load the labelled corpus; 'FilePath' is a placeholder for the real CSV path.
# The first row of the file supplies the column names.
df = pd.read_csv('FilePath', header=0)

# Feature column (text) and target column (sentiment label).
X, y = df['content'], df['sentiment']
def lrSentimentAnalysis(n):
    """Grid-search a logistic regression on bag-of-words counts for one split.

    Splits the module-level ``X``/``y`` with ``test_size=n``, fits a
    ``CountVectorizer`` on the training part only, tunes ``max_iter``, ``C``
    and ``solver`` with 10-fold cross-validation, and reports the accuracy and
    RMSE measured on the held-out part.

    Args:
        n: Fraction of the data held out for testing (passed as ``test_size``).

    Returns:
        ``[n, accuracy, max_iter, C, solver]`` where ``accuracy`` is the
        held-out accuracy in percent and the last three entries are the best
        hyper-parameters found. This is the layout the sibling helpers index
        into: ``findModelWithBestAccuracy`` reads index 1,
        ``darwConfusionMetrix`` reads indices 0, 2, 3 and 4.
    """
    # Using CountVectorizer to convert text into tokens/features
    vect = CountVectorizer(ngram_range=(1, 1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=n)

    # Fit the vocabulary on the training split only, then apply the same
    # mapping to both splits so no information leaks from the test split.
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    X_test_dtm = vect.transform(X_test)

    max_iter = [100, 110, 120, 130, 140, 150]
    C = [1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5]
    solvers = ['newton-cg', 'lbfgs', 'liblinear']
    param_grid = dict(max_iter=max_iter, C=C, solver=solvers)

    # NOTE: `multi_class` is deprecated in recent scikit-learn releases;
    # 'auto' is its default, so it can be dropped when upgrading.
    LR1 = LogisticRegression(penalty='l2', multi_class='auto')
    grid = GridSearchCV(estimator=LR1, param_grid=param_grid, cv=10, n_jobs=-1)
    grid_result = grid.fit(X_train_dtm, y_train)

    # Summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    y_pred = grid_result.predict(X_test_dtm)
    accuracy = metrics.accuracy_score(y_test, y_pred) * 100
    print('Accuracy Score: ', accuracy, '%')
    # NOTE(review): RMSE requires numeric labels -- confirm the 'sentiment'
    # column is numerically encoded.
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

    best_params = grid_result.best_estimator_.get_params()
    # The accuracy sits at index 1 so that callers can rank the runs by it.
    return [n, accuracy, best_params['max_iter'], best_params['C'], best_params['solver']]
def darwConfusionMetrix(accList):
    """Refit the chosen configuration and save its confusion-matrix heatmap.

    Args:
        accList: One result row; index 0 is the test-set fraction, index 2 is
            ``max_iter``, index 3 is ``C`` and index 4 is the solver name.
            Index 1 is not read here (``findModelWithBestAccuracy`` treats it
            as the accuracy).

    Side effects:
        Shows the figure and writes it to the placeholder path ``'FilePath'``.
    """
    # Using CountVectorizer to convert text into tokens/features
    vect = CountVectorizer(ngram_range=(1, 1))
    # Same seed as in `lrSentimentAnalysis`, so this reproduces the split the
    # configuration was originally evaluated on.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=1, test_size=accList[0]
    )

    # Using training data to transform text into counts of features for each message
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    X_test_dtm = vect.transform(X_test)

    # Accuracy using Logistic Regression Model
    LR = LogisticRegression(penalty='l2', max_iter=accList[2], C=accList[3], solver=accList[4])
    LR.fit(X_train_dtm, y_train)
    y_pred = LR.predict(X_test_dtm)

    # Use one explicit label list for both the matrix and the axes, so the
    # DataFrame shape matches even when a class is predicted but absent from
    # `y_test`.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

    # creating a heatmap for confusion matrix
    data = metrics.confusion_matrix(y_test, y_pred, labels=labels)
    df_cm = pd.DataFrame(data, columns=labels, index=labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'
    plt.figure(figsize=(10, 7))
    sns.set(font_scale=1.4)  # for label size
    sns.heatmap(df_cm, cmap="Blues", annot=True, annot_kws={"size": 16})  # font size
    fig0 = plt.gcf()
    fig0.show()
    fig0.savefig('FilePath', dpi=100)
def findModelWithBestAccuracy(accList):
    """Return the result row with the highest accuracy.

    Args:
        accList: Non-empty list of result rows; index 1 of each row is the
            accuracy used for ranking.

    Returns:
        The row (the same object, not a copy) whose index-1 value is largest.
        When several rows tie, the earliest one wins.

    Raises:
        ValueError: If ``accList`` is empty.
    """
    if not accList:
        raise ValueError("accList must contain at least one result row")
    # `max` returns the first maximal element, which matches the behaviour of
    # looking up `list.index(max(...))` on the extracted accuracies.
    best = max(accList, key=lambda row: row[1])
    print('Best Model:', best)
    return best
# Collect one result row per train/test split.
accList = []
print('Logistic Regression')
print('grid search method for hyperparameter tuning (accurcy by cross validation) ')
# Try test-set fractions 0.2, 0.3, ..., 0.6.
for i in range(2, 7):
    n = i / 10.0
    print("\nsplit ", i - 1, ": n=", n)
    accList.append(lrSentimentAnalysis(n))
# After every split has been evaluated, plot the confusion matrix of the best one.
darwConfusionMetrix(findModelWithBestAccuracy(accList))
,
预处理是构建性能良好的分类器的重要组成部分。如果您在训练和测试集性能之间有如此大的差异,则很可能在(测试集的)预处理过程中发生了一些错误。
无需任何编程也可以使用分类器。此处(见下文)的第二个视频展示了如何根据邮件中的关键字对情感进行分类。
您可以访问网络服务 insight classifiers,先免费试用并构建自己的分类器。
您的新数据可能与您用来训练和测试模型的第一个数据集有很大不同。预处理技术和统计分析可以帮助您刻画数据的特征并比较不同的数据集。模型在新数据上表现不佳可能有多种原因,包括:
- 您的初始数据集在统计上并不代表更大的数据集(例如:您的数据集是一个极端案例)
- 过拟合:模型训练过度,学到了训练数据特有的细节(噪声)
- 不同的预处理方法
- 不平衡的训练数据集。机器学习技术最适合平衡的数据集(即训练集中各个类别的出现频率大致相同)