如何基于构造函数的参数创建具有类属性的子类,以供GridSearchCV的估计器使用?

问题描述

我想继承sklearn.svm.LinearSVC,并将其用作sklearn.model_selection.GridSearchCV的估计器。我之前在子类化方面遇到过一些问题,我认为已经根据之前的帖子和被采纳的答案修复了这些问题。

但是,现在我的目标是创建一个sklearn.kernel_approximation.RBFSampler对象,作为新类的属性。这只是一个示例,我在这里想问的是一个更一般的问题:

问题:我最终希望将新的估计器类与GridSearchCV一起使用,那么如何根据传入构造函数的参数值来创建(或不创建)属性?

到目前为止,我已经尝试过以下操作:

from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.model_selection import gridsearchcv
from sklearn.kernel_approximation import RBFSampler
from sklearn.datasets import load_breast_cancer

RANDOM_STATE = 123


class LinearSVCSub(LinearSVC):
    """LinearSVC that optionally maps its input through an RBFSampler.

    This is the asker's original attempt. It is kept as posted, including the
    mistake that produces the NameError shown in the traceback below; only
    the method signatures and calls are written out in full.

    ``sampler_gamma`` and ``sampler_n`` are the ``gamma`` and ``n_components``
    values handed to RBFSampler; every other constructor argument is
    forwarded unchanged to LinearSVC.
    """

    def __init__(self, penalty='l2', loss='squared_hinge',
                 sampler_gamma=None, sampler_n=None, dual=True, tol=0.0001,
                 C=1.0, multi_class='ovr', fit_intercept=True,
                 intercept_scaling=1, class_weight=None, verbose=0,
                 random_state=None, max_iter=1000):

        super(LinearSVCSub, self).__init__(
            penalty=penalty, loss=loss, dual=dual, tol=tol, C=C,
            multi_class=multi_class, fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling, class_weight=class_weight,
            verbose=verbose, random_state=random_state, max_iter=max_iter)

        self.sampler_gamma = sampler_gamma
        self.sampler_n = sampler_n

        # I have also tried a conditional statement here instead of
        #  within a separate function create_sampler()
        # NOTE: this bare call is what raises the NameError in the traceback
        # below -- create_sampler is a method, so it is only reachable as
        # self.create_sampler(). Left as posted because it is the subject of
        # the question.
        self.sampler = create_sampler()

    def fit(self, X, y, sample_weight=None):
        """Transform X, then fit the underlying LinearSVC; returns self."""
        X = self.transform_this(X)
        super(LinearSVCSub, self).fit(X, y, sample_weight)
        return self

    def predict(self, X):
        """Transform X, then return LinearSVC's predictions for it."""
        X = self.transform_this(X)
        return super(LinearSVCSub, self).predict(X)

    def score(self, X, y, sample_weight=None):
        """Transform X, then return LinearSVC's score on (X, y)."""
        X = self.transform_this(X)
        return super(LinearSVCSub, self).score(X, y, sample_weight)

    def decision_function(self, X):
        """Transform X, then return LinearSVC's decision values for it."""
        X = self.transform_this(X)
        return super(LinearSVCSub, self).decision_function(X)

    def transform_this(self, X):
        """Return X passed through the sampler, or X itself without one."""
        if self.sampler is not None:
            # NOTE: as posted, this reads the bare name `sampler` rather than
            # self.sampler, so it would raise NameError as well if reached.
            X = sampler.fit_transform(X)
        return X

    def create_sampler(self):
        """Return an RBFSampler if both sampler settings are set, else None."""
        # If sampler_gamma and sampler_n have been given,create a sampler
        if (self.sampler_gamma is not None) and (self.sampler_n is not None):
            sampler = RBFSampler(gamma=self.sampler_gamma,
                                 n_components=self.sampler_n)
        else:
            sampler = None

        return sampler


if __name__ == '__main__':
    # Breast-cancer data set: feature matrix and target vector.
    data = load_breast_cancer()
    X = data.data
    y = data.target

    # Parameter tuning with custom LinearSVC
    param_grid = {
        'C': [0.00001, 0.0005],
        'dual': (True, False),
        'random_state': [RANDOM_STATE],
        'sampler_gamma': [0.90, 0.60, 0.30],
        'sampler_n': [10, 200],
    }

    # Two-fold grid search over the custom estimator, scored by ROC AUC and
    # using every available core.
    gs_model = gridsearchcv(
        estimator=LinearSVCSub(),
        param_grid=param_grid,
        scoring='roc_auc',
        cv=2,
        n_jobs=-1,
        verbose=1,
    )
    gs_model.fit(X, y)
    gs_model.cv_results_

但是,据我了解,GridSearchCV会首先使用默认值实例化估计器对象,并且其实现方式与sklearn.tree.DecisionTreeClassifier中的feature_importances_属性类似。

另外,我从上面的代码中得到的错误是:

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-a11420cc931e> in <module>
     66                       'sampler_n': [10,200]}
     67 
---> 68     gs_model = gridsearchcv(estimator=LinearSVCSub(),69                             scoring='roc_auc',cv=2)
     70     gs_model.fit(X,y)

<ipython-input-6-a11420cc931e> in __init__(self,penalty,loss,sampler_gamma,sampler_n,dual,tol,C,multi_class,fit_intercept,intercept_scaling,class_weight,verbose,random_state,max_iter)
     21         self.sampler_n = sampler_n
     22 
---> 23         self.sampler = create_sampler()
     24 
     25 

NameError: name 'create_sampler' is not defined

解决方法

  1. 使用__init__构造函数作为容器来存储属性。
  2. 在方法中执行所有相应的逻辑
from sklearn.datasets import make_classification
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.kernel_approximation import RBFSampler
from sklearn.datasets import load_breast_cancer

RANDOM_STATE = 123


class LinearSVCSub(LinearSVC):
    """LinearSVC with an optional RBFSampler feature map in front of it.

    ``__init__`` only stores its arguments, so that ``get_params`` /
    ``set_params`` / ``clone`` (and therefore GridSearchCV) can rebuild the
    estimator for every parameter combination. All real work happens in the
    methods.

    Parameters
    ----------
    sampler_gamma : float or None
        ``gamma`` passed to RBFSampler when the sampler is enabled.
    sampler_n : int or None
        ``n_components`` passed to RBFSampler when the sampler is enabled.
    sampler : object
        Truthy to map the input through an RBFSampler, falsy to feed the raw
        input to LinearSVC.

    Every other parameter is forwarded unchanged to LinearSVC.

    Attributes
    ----------
    sampler_ : RBFSampler or None
        Set by ``fit``: the sampler fitted on the training data, or None when
        the sampler is disabled.
    """

    def __init__(self, penalty='l2', loss='squared_hinge',
                 sampler_gamma=None, sampler_n=None, dual=True, tol=0.0001,
                 C=1.0, multi_class='ovr', fit_intercept=True,
                 intercept_scaling=1, class_weight=None, verbose=0,
                 random_state=None, max_iter=1000, sampler=None):

        super().__init__(
            penalty=penalty, loss=loss, dual=dual, tol=tol, C=C,
            multi_class=multi_class, fit_intercept=fit_intercept,
            intercept_scaling=intercept_scaling, class_weight=class_weight,
            verbose=verbose, random_state=random_state, max_iter=max_iter)

        self.sampler_gamma = sampler_gamma
        self.sampler_n = sampler_n
        self.sampler = sampler

    def fit(self, X, y, sample_weight=None):
        """Fit the optional sampler on X, then fit LinearSVC; returns self."""
        if self.sampler:
            # Fit the random feature map exactly once, on the training data,
            # and keep it: the same map must be applied at prediction time.
            # Seeding it from random_state makes the search reproducible.
            self.sampler_ = RBFSampler(gamma=self.sampler_gamma,
                                       n_components=self.sampler_n,
                                       random_state=self.random_state)
            X = self.sampler_.fit_transform(X)
        else:
            self.sampler_ = None
        super().fit(X, y, sample_weight)
        return self

    def predict(self, X):
        """Return predicted class labels for the raw (untransformed) X."""
        # No transform here: LinearSVC.predict obtains its scores from
        # self.decision_function, which already applies the feature map.
        # Transforming here as well would apply the map twice.
        return super().predict(X)

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the raw (untransformed) X and y."""
        # No transform here either: the inherited score calls self.predict,
        # which reaches self.decision_function.
        return super().score(X, y, sample_weight)

    def decision_function(self, X):
        """Return decision values for X after applying the fitted sampler."""
        # Single place where inference-time input is transformed.
        return super().decision_function(self.transform_this(X))

    def transform_this(self, X):
        """Map X through the sampler fitted in ``fit``.

        Returns X unchanged when the estimator was fitted without a sampler
        or has not been fitted yet. Never refits the sampler.
        """
        fitted_sampler = getattr(self, 'sampler_', None)
        if fitted_sampler is None:
            return X
        return fitted_sampler.transform(X)



# Breast-cancer data set: feature matrix and target vector.
data = load_breast_cancer()
X = data.data
y = data.target

# Parameter tuning with custom LinearSVC
param_grid = {
    'C': [0.00001, 0.0005],
    'dual': (True, False),
    'random_state': [RANDOM_STATE],
    'sampler_gamma': [0.90, 0.60, 0.30],
    'sampler_n': [10, 200],
    'sampler': [0, 1],
}

# Two-fold grid search over the custom estimator, scored by ROC AUC and using
# every available core.
gs_model = GridSearchCV(
    estimator=LinearSVCSub(sampler=1),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=2,
    n_jobs=-1,
    verbose=1,
)
gs_model.fit(X, y)
gs_model.cv_results_