问题描述
作为一般经验法则,首先需要在数据集上运行基线模型。我知道 H2O AutoML 和其他 AutoML 包可以做到这一点,但我想尝试使用 Scikit-learn 的 Pipeline 来实现。
这是我到目前为止所做的:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,make_scorer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# Shared seed, unpacked into every split and every estimator that accepts
# `random_state`, so repeated runs are reproducible.
rs = {'random_state': 42}

# 60% train / 40% hold-out. `features` and `target` must already be defined.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, train_size=0.6, **rs)

# Halve the hold-out into validation and test sets. Features and labels are
# split in the same call so rows stay aligned; the call returns four arrays.
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, train_size=0.5, **rs)
# Classification - Model Pipeline
def train_models(X_train, X_val, y_test):
    """Fit one scaled baseline pipeline per classifier and score each of them.

    Every classifier is wrapped in a ``StandardScaler -> classifier`` pipeline,
    fitted on the training data, scored with 3-fold cross-validated F1 on the
    training data, and scored with F1 on the test data.

    Args:
        X_train: Training feature matrix.
        X_val: Validation feature matrix. Accepted for interface
            compatibility; not used by the current implementation.
        y_test: True labels of the test set.

    Returns:
        A tuple ``(pipelines, scores_df)``: the list of fitted pipelines in
        classifier order, and a DataFrame with the columns ``Model``,
        ``Val_score`` and ``F1_score`` (one row per classifier).

    Note:
        ``y_train``, ``X_test`` and ``rs`` are read from module scope, so they
        must be defined before this function is called. ``f1_score`` is used
        with its default averaging, which assumes a binary target -- TODO
        confirm against the dataset.
    """
    log_reg = LogisticRegression(**rs)
    nb = BernoulliNB()
    knn = KNeighborsClassifier()
    svm = SVC(**rs)
    mlp = MLPClassifier(max_iter=5000, **rs)
    dt = DecisionTreeClassifier(**rs)
    et = ExtraTreesClassifier(**rs)
    rf = RandomForestClassifier(**rs)
    xgb = XGBClassifier(**rs, verbosity=0)

    scorer = make_scorer(f1_score)
    clfs = [
        ('Logistic Regression', log_reg),
        ('Naive Bayes', nb),
        ('K-Nearest Neighbors', knn),
        ('SVM', svm),
        ('MLP', mlp),
        ('Decision Tree', dt),
        ('Extra Trees', et),
        ('Random Forest', rf),
        ('XGBoost', xgb),
    ]

    pipelines = []
    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append was removed in pandas 2.0.
    rows = []
    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()),
            ('classifier', clf),
        ])
        pipeline.fit(X_train, y_train)

        # cross_val_score requires the data explicitly; it clones the pipeline
        # for each fold, so the fit above is left untouched.
        val_score = cross_val_score(
            pipeline, X_train, y_train, scoring=scorer, cv=3).mean()
        print(f'{clf_name}\n{"-" * 30}\nModel Val-score: {val_score:.4f}')

        test_score = f1_score(y_test, pipeline.predict(X_test))
        print(f'Model F1-score: {test_score:.4f}\n\n')

        pipelines.append(pipeline)
        rows.append({
            'Model': clf_name,
            'Val_score': val_score,
            'F1_score': test_score,
        })

    scores_df = pd.DataFrame(rows, columns=['Model', 'Val_score', 'F1_score'])
    return pipelines, scores_df
我只是想从有经验的程序员那里通过讨论来获取一些知识。我只是期待一个建议/参考或一种有效的方式/方法来做到这一点。
解决方法
尝试投票分类器。使用投票方法寻找最佳分类器需要采用集成策略。
# Voting ensemble over three heterogeneous classifiers.
# X_train, y_train, X_test and y_test must already exist (e.g. from an
# earlier train_test_split).
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Seed shared by the estimators that accept random_state
SEED = 1

# Instantiate lr
lr = LogisticRegression(random_state=SEED)
# Instantiate knn
knn = KNeighborsClassifier(n_neighbors=27)
# Instantiate dt; a float min_samples_leaf is a fraction of the training
# samples and therefore has to be below 1
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)
# Define the list classifiers
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
]
# Instantiate a VotingClassifier vc
vc = VotingClassifier(estimators=classifiers)
# Fit vc to the training set
vc.fit(X_train, y_train)
# Evaluate the test set predictions
y_pred = vc.predict(X_test)
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('{:s} : {:.3f}'.format('Voting Classifier', accuracy))
尝试使用 BaggingClassifier 来提高准确性。其他可用的集成分类器(属于提升法而非装袋法)有:AdaBoostClassifier 和 GradientBoostingClassifier。此时你也可以考虑使用 PyTorch 来解决精度问题。我看到你用了 MLPClassifier,它是一个神经网络,但只使用了默认设置,没有针对该任务专门配置。
# Bagging ensemble of shallow decision trees.
# X (features) and y (labels) must already be defined.
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

SEED = 1
# Stratified 70/30 split so both parts keep the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=SEED)
# A float min_samples_leaf is a fraction of the training samples and
# therefore has to be below 1
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, and the positional
# form works with both spellings.
bc = BaggingClassifier(dt, n_estimators=300, n_jobs=-1)
bc.fit(X_train, y_train)
# Evaluate the test set predictions
y_pred = bc.predict(X_test)
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('{:s} : {:.3f}'.format('Bagging Classifier', accuracy))
,
通常,构建管道是为了解决需要一个或多个分类器共同工作的特定任务。但是,在您的情况下,许多分类器是各自独立工作,而不是联合工作。如果您想了解有关管道的更多信息,可以查看 Huggingface
中的几个示例。以下是情感分析任务的管道示例:
>>> from transformers import pipeline
>>> nlp = pipeline("sentiment-analysis")
>>> result = nlp("I hate you")[0]
>>> print(f"label: {result['label']},with score: {round(result['score'],4)}")
label: NEGATIVE,with score: 0.9991
>>> result = nlp("I love you")[0]
>>> print(f"label: {result['label']},4)}")
label: POSITIVE,with score: 0.9999