是否有用于运行多个分类器的 Python 管道函数？

问题描述

作为一般经验法则，需要先在数据集上运行基线模型。我知道 H2O AutoML 和其他 AutoML 包会这样做。但是我想尝试使用 Scikit-learn 的 Pipeline。

这是我到目前为止所做的：

import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,make_scorer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# Shared keyword arguments so every split and every seeded estimator uses the
# same random_state.
rs = {'random_state': 42}

# First split: 60% of the rows go to training, the remaining 40% are held out.
# `features` and `target` are expected to be defined before this point.
X_train, X_test, y_train, y_test = train_test_split(features, target, train_size=0.6, **rs)

# Second split: halve the held-out part into a validation set and a test set.
# Features AND labels are passed together so that four arrays come back and
# the rows of X_val/y_val and X_test/y_test stay aligned.
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=0.5, **rs)

# Classification - Model Pipeline
def train_models(X_train, X_val, y_test):
    """Fit a scaled pipeline for each baseline classifier and score it.

    Each classifier is wrapped in a ``StandardScaler`` -> classifier
    ``Pipeline``, fitted on the training split, scored with 3-fold
    cross-validation (F1) on the validation split, and scored with F1 on the
    test split. Progress is printed for every model.

    NOTE: ``y_train``, ``y_val`` and ``X_test`` are not parameters. As in the
    original code, they are read from the enclosing module scope and must be
    defined before this function is called.

    Args:
        X_train: Training features used to fit each pipeline.
        X_val: Validation features used for the cross-validated score.
        y_test: True test labels compared with ``pipeline.predict(X_test)``.

    Returns:
        A ``(pipelines, scores_df)`` tuple: the fitted pipelines in training
        order, and a DataFrame with the columns ``Model``, ``Val_score`` and
        ``F1_score`` holding one row per classifier.
    """
    # f1_score is used with its default settings; presumably the target is
    # binary -- confirm before using this on multi-class data.
    scorer = make_scorer(f1_score)

    clfs = [
        ('Logistic Regression', LogisticRegression(**rs)),
        ('Naive Bayes', BernoulliNB()),
        ('K-Nearest Neighbors', KNeighborsClassifier()),
        ('SVM', SVC(**rs)),
        ('MLP', MLPClassifier(max_iter=5000, **rs)),
        ('Decision Tree', DecisionTreeClassifier(**rs)),
        ('Extra Trees', ExtraTreesClassifier(**rs)),
        ('Random Forest', RandomForestClassifier(**rs)),
        ('XGBoost', XGBClassifier(**rs, verbosity=0)),
    ]

    pipelines = []
    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; DataFrame.append no longer exists in pandas 2.x.
    rows = []
    for clf_name, clf in clfs:
        pipeline = Pipeline(steps=[
            ('scaler', StandardScaler()), ('classifier', clf)])
        pipeline.fit(X_train, y_train)

        # cross_val_score needs the data explicitly; the original call omitted
        # X and y and therefore raised a TypeError.
        val_score = cross_val_score(
            pipeline, X_val, y_val, scoring=scorer, cv=3).mean()
        print(f'{clf_name}\n{"-" * 30}\nModel Val-score: {val_score:.4f}')

        test_score = f1_score(y_test, pipeline.predict(X_test))
        print(f'Model F1-score: {test_score:.4f}\n\n')

        pipelines.append(pipeline)
        rows.append({'Model': clf_name, 'Val_score': val_score, 'F1_score': test_score})

    scores_df = pd.DataFrame(rows, columns=['Model', 'Val_score', 'F1_score'])
    return pipelines, scores_df

我只是想从有经验的程序员那里通过讨论来获取一些知识。我只是期待一个建议/参考或一种有效的方式/方法来做到这一点。

为机器学习分类问题制作流水线的有效方法是什么？

解决方法

尝试投票分类器。使用投票方法寻找最佳分类器需要采用集成策略。

  # Voting ensemble example. X_train, X_test, y_train and y_test are assumed
  # to come from an earlier train_test_split.
  from sklearn.ensemble import VotingClassifier
  from sklearn.linear_model import LogisticRegression
  from sklearn.metrics import accuracy_score
  from sklearn.neighbors import KNeighborsClassifier as KNN
  from sklearn.tree import DecisionTreeClassifier

  # Seed shared by the estimators that accept random_state
  SEED = 1

  # Instantiate lr
  lr = LogisticRegression(random_state=SEED)

  # Instantiate knn
  knn = KNN(n_neighbors=27)

  # Instantiate dt. A float min_samples_leaf is a fraction of the training
  # samples and must lie strictly between 0 and 1.
  dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)

  # Define the list of (name, estimator) pairs that take part in the vote
  classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('Classification Tree', dt)]

  # Instantiate a VotingClassifier vc
  vc = VotingClassifier(estimators=classifiers)

  # Fit vc to the training set
  vc.fit(X_train, y_train)

  # Evaluate the test set predictions
  y_pred = vc.predict(X_test)

  # Calculate accuracy score
  accuracy = accuracy_score(y_test, y_pred)
  print('Voting Classifier: {:.3f}'.format(accuracy))

尝试使用 BaggingClassifier 来提高准确性。其他集成分类器还有 AdaBoostClassifier 和 GradientBoostingClassifier（二者属于提升方法而非装袋方法；GradientBoostingRegressor 是对应的回归器）。此时你可能还要考虑使用 pytorch 来解决精度问题。我看到你用了 MLPClassifier，它是一个神经网络，但它并不是被单独使用的，也没有为它定义任何配置。

 # Bagging ensemble example. X (features) and y (labels) are assumed to be
 # defined before this snippet runs.
 from sklearn.ensemble import BaggingClassifier
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.metrics import accuracy_score
 from sklearn.model_selection import train_test_split

 # Seed for the split and the base tree
 SEED = 1

 # Hold out 30% of the data for testing, stratified on the labels
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)

 # Base estimator. A float min_samples_leaf is a fraction of the training
 # samples and must lie strictly between 0 and 1.
 dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)

 # The base estimator is passed positionally because its keyword was renamed
 # from `base_estimator` to `estimator` in newer scikit-learn releases.
 bc = BaggingClassifier(dt, n_estimators=300, n_jobs=-1)

 # Fit bc to the training set
 bc.fit(X_train, y_train)

 # Evaluate the test set predictions
 y_pred = bc.predict(X_test)

 # Calculate accuracy score
 accuracy = accuracy_score(y_test, y_pred)
 print('Bagging Classifier: {:.3f}'.format(accuracy))

通常，构建管道是为了解决需要一个或多个分类器共同工作的特定任务。但是，在您的情况下，许多分类器是各自独立工作的，而不是联合工作。如果您想了解有关管道的更多信息，可以查看 Huggingface 中的几个示例。

以下是情绪分析任务的管道示例：

>>> from transformers import pipeline

>>> nlp = pipeline("sentiment-analysis")

>>> result = nlp("I hate you")[0]
>>> print(f"label: {result['label']},with score: {round(result['score'],4)}")
label: NEGATIVE,with score: 0.9991

>>> result = nlp("I love you")[0]
>>> print(f"label: {result['label']},4)}")
label: POSITIVE,with score: 0.9999

classification machine-learning pipeline python scikit-learn