问题描述
我是编程新手,在将模型保存为 PMML 时遇到了问题。我有一个数据集,需要先选择特征(属性),然后使用多数投票,最后将模型保存为 PMML。多数投票部分可以正常运行,但在最后一行使用 sklearn2pmml 保存模型时报错。
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.metrics import accuracy_score
from sklearn2pmml import make_pmml_pipeline
from sklearn2pmml import sklearn2pmml
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn.ensemble import VotingClassifier
import joblib
# Load the training and test sets; neither CSV file has a header row.
url = 'D:/treinamento.CSV'
df = read_csv(url, header=None)
data = df.values
url_test = 'D:/TESTE.CSV'
df_test = read_csv(url_test, header=None)
data_test = df_test.values

# The last column is the label, every other column is a feature.
X = data[:, :-1]
y = data_test[:, -1]
X_train = data[:, :-1]
X_test = data_test[:, :-1]
y_train = data[:, -1]
y_test = y

# Feature selection: column indices fed to each of the four sub-models.
features1 = [2, 5, 7]
features2 = [0, 1, 4, 7]
features3 = [0, 6]
features4 = [1, 4]

# Every ColumnTransformer entry is a (name, transformer, columns) triple.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
preprocessor1 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features1)])
preprocessor2 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features2)])
preprocessor3 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features3)])
preprocessor4 = ColumnTransformer(transformers=[('numerical', numeric_transformer, features4)])

# One decision tree per feature subset; every pipeline step is a (name, estimator) pair.
pipe1 = PMMLPipeline(steps=[('preprocessor', preprocessor1), ('classifier', DecisionTreeClassifier(min_samples_split=2))])
pipe2 = PMMLPipeline(steps=[('preprocessor', preprocessor2), ('classifier', DecisionTreeClassifier(min_samples_split=2))])
pipe3 = PMMLPipeline(steps=[('preprocessor', preprocessor3), ('classifier', DecisionTreeClassifier(min_samples_split=2))])
pipe4 = PMMLPipeline(steps=[('preprocessor', preprocessor4), ('classifier', DecisionTreeClassifier(min_samples_split=2))])

# Hard (majority) voting over the four sub-pipelines, one weight per estimator.
eclf = VotingClassifier(
    estimators=[('pipe1', pipe1), ('pipe2', pipe2), ('pipe3', pipe3), ('pipe4', pipe4)],
    voting='hard',
    weights=[1, 1, 1, 1],
)
eclf.fit(X_train, y_train)
yhat = eclf.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % (accuracy * 100))

# NOTE: eclf is a VotingClassifier, not a PMMLPipeline, so this call raises the
# TypeError shown in the traceback below; this is the problem being asked about.
sklearn2pmml(eclf,"D:/MesTrado/ARTIGO DRC/dados_pos_revisao/cross validation - dados reavaliados/4 revisao/5 FOLDS/1 FOLD/eclf.pmml",with_repr = True)
65 sklearn2pmml(eclf,"D:/mest/eclf.pmml",with_repr = True)
~\anaconda3\lib\site-packages\sklearn2pmml\__init__.py in sklearn2pmml(pipeline,pmml,user_classpath,with_repr,debug,java_encoding)
222 print("{0}: {1}".format(java_version[0],java_version[1]))
223 if not isinstance(pipeline,PMMLPipeline):
--> 224 raise TypeError("The pipeline object is not an instance of " + PMMLPipeline.__name__ + ". Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline")
225 estimator = pipeline._final_estimator
226 cmd = ["java","-cp",os.pathsep.join(_classpath(user_classpath)),"org.jpmml.sklearn.Main"]
TypeError: The pipeline object is not an instance of PMMLPipeline. Use the 'sklearn2pmml.make_pmml_pipeline(obj)' utility function to translate a regular Scikit-Learn estimator or pipeline to a PMML pipeline
解决方法
管道对象不是 PMMLPipeline 的实例
您是否阅读了 SkLearn2PMML 的错误消息?可能没有,因为它已经清楚地说明了问题所在!
您在完全错误的地方使用了 PMMLPipeline 类。它应该仅用作 VotingClassifier 估计器的最顶层包装器。
请像这样重新组织您的代码:
# Wrap the whole VotingClassifier in a single top-level PMMLPipeline; the
# sub-pipelines inside the ensemble are plain scikit-learn Pipeline objects.
# Each `Pipeline(...)` is a placeholder for that sub-pipeline's actual steps.
pipeline = PMMLPipeline([
    ("classifier", VotingClassifier([
        ("pipe1", Pipeline(...)),
        ("pipe2", Pipeline(...)),
        ("pipe3", Pipeline(...)),
    ])),
])
# sklearn2pmml accepts this object because it is a PMMLPipeline instance.
sklearn2pmml(pipeline, "pipeline.pmml")