使用 VotingClassifier 在 sklearn2pmml 中保存模型时出错

问题描述

我是编程新手,在 pmml 中保存模型时遇到了一些麻烦。我有一个数据库,我需要选择属性,然后使用多数票,最后保存在 pmml 中。即使是多数投票部分也能工作,但是当我使用 sklearn2pmml 在最后一行保存模型时,它会出错。

from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import accuracy_score
from sklearn2pmml import make_pmml_pipeline
from sklearn2pmml import sklearn2pmml
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.ensemble import VotingClassifier
import joblib

url = 'D:/treinamento.CSV'
df = read_csv(url,header=None)
data = df.values

url_test = 'D:/TESTE.CSV'
df_test = read_csv(url_test,header=None)
data_test = df_test.values
   
X = data[:,:-1]
y = data_test[:,-1]

X_train = data[:,:-1]
X_test = data_test[:,:-1]
y_train = data[:,-1]
y_test = y
#features selection
features1 = [2,5,7]
features2 = [0,1,4,7]
features3 = [0,6]
features4 = [1,4]
numeric_transformer = Pipeline(steps=[('scaler',StandardScaler())])
preprocessor1 = ColumnTransformer(transformers=[('numerical',numeric_transformer,features1)])
preprocessor2 = ColumnTransformer(transformers=[('numerical',features2)])
preprocessor3 = ColumnTransformer(transformers=[('numerical',features3)])
preprocessor4 = ColumnTransformer(transformers=[('numerical',features4)])

pipe1 = PMMLPipeline(steps=[('preprocessor',preprocessor1),('classifier',DecisionTreeClassifier(min_samples_split = 2))])
pipe2 = PMMLPipeline(steps=[('preprocessor',preprocessor2),DecisionTreeClassifier(min_samples_split = 2))])
pipe3 = PMMLPipeline(steps=[('preprocessor',preprocessor3),DecisionTreeClassifier(min_samples_split = 2))])
pipe4 = PMMLPipeline(steps=[('preprocessor',preprocessor4),DecisionTreeClassifier(min_samples_split = 2))])



eclf = VotingClassifier(estimators=[('pipe1',PMMLPipeline(steps=[('preprocessor',DecisionTreeClassifier(min_samples_split = 2))])),('pipe2',('pipe3',('pipe4',DecisionTreeClassifier(min_samples_split = 2))]))],voting='hard',weights=[1,1])

eclf.fit(X_train,y_train)
yhat = eclf.predict(X_test)
accuracy = accuracy_score(y_test,yhat)
print('Accuracy: %.3f' % (accuracy * 100))

sklearn2pmml(eclf,"D:/MesTrado/ARTIGO DRC/dados_pos_revisao/cross validation - dados reavaliados/4 revisao/5 FOLDS/1 FOLD/eclf.pmml",with_repr = True)

代码错误

65 sklearn2pmml(eclf,"D:/mest/eclf.pmml",with_repr = True)

~\anaconda3\lib\site-packages\sklearn2pmml\__init__.py in sklearn2pmml(pipeline,pmml,user_classpath,with_repr,debug,java_encoding)
    222                 print("{0}: {1}".format(java_version[0],java_version[1]))
    223         if not isinstance(pipeline,PMMLPipeline):
--> 224                 raise TypeError("The pipeline object is not an instance of " + PMMLPipeline.__name__ + ". Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline")
    225         estimator = pipeline._final_estimator
    226         cmd = ["java","-cp",os.pathsep.join(_classpath(user_classpath)),"org.jpmml.sklearn.Main"]

TypeError: The pipeline object is not an instance of PMMLPipeline. Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline

解决方法

管道对象不是 PMMLPipeline 的实例

您是否阅读了 SkLearn2PMML 错误消息?可能不会,因为它清楚地说明了问题所在!

您在完全错误的地方使用了 PMMLPipeline 类。它应该仅用作 VotingClassifier 估算器的最顶层包装器

请像这样重新组织您的代码:

pipeline = PMMLPipeline([
  ("classifier",VotingClassifier([
    ("pipe1",Pipeline(...)),("pipe2",("pipe3",Pipeline(...))
  ]))
])
sklearn2pmml(pipeline,"pipeline.pmml")