无法将 sklearn.compose.ColumnTransformer 仅应用于 pandas DataFrame 的一列

问题描述

我定义了一个自定义的转换器（transformer），它接受一个 pandas DataFrame，仅对其中一列应用某个函数，而其余所有列保持不变。单独测试时该转换器工作正常，但把它放进管道（Pipeline）后却无法正常工作。

这是该转换器：

import re
from sklearn.base import BaseEstimator,TransformerMixin

class SynopsisCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises free-text synopsis strings.

    ``fit`` learns nothing; ``transform`` runs ``clean_text`` on every
    element of its input, addressing elements as ``X[0] .. X[n-1]``.
    """

    def __init__(self):
        """Create the cleaner; it takes no hyper-parameters."""

    def fit(self, X, y=None, **fit_params):
        """Return ``self`` unchanged: there is nothing to learn from the data."""
        return self

    def clean_text(self, text):
        """Return a lower-cased, letters-only version of ``text``.

        Steps, in order: lower-case the text, strip ``@mentions``, strip
        ``http(s)://`` URLs, strip ``www...`` tokens, strip any leftover
        ``...www...com...`` run, replace every non-letter with a space,
        drop tokens of length 2 or less, and join the rest with single
        spaces.
        """
        text = text.lower()
        # Every substitution needs an explicit replacement string; without
        # it ``text`` is taken as the replacement and re.sub raises TypeError.
        text = re.sub(r'@[a-zA-Z0-9_]+', '', text)
        text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)
        text = re.sub(r'www.[^ ]+', '', text)
        text = re.sub(r'[a-zA-Z0-9]*www[a-zA-Z0-9]*com[a-zA-Z0-9]*', '', text)
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        tokens = [token for token in text.split() if len(token) > 2]
        return ' '.join(tokens)

    def transform(self, X, **fit_params):
        """Clean every element of ``X`` in place and return ``X``.

        ``X`` must be accepted as a positional argument because callers
        invoke ``transform(X)``.

        Elements are read and written through ``X[i]`` for ``i`` in
        ``range(X.shape[0])``. When ``X`` is a DataFrame, ``X[i]`` is a
        column lookup, so a frame with no column named ``0`` raises
        ``KeyError: 0``.
        """
        for i in range(X.shape[0]):
            X[i] = self.clean_text(X[i])
        return X

当我像这样手动测试时，它按预期运行。

# Manual check: call transform directly on the single 'Synopsis' column.
train_synopsis = SynopsisCleaner().transform(train_data['Synopsis'])

但是，当我将其包含在sklearn管道中时：

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Part 1: a column transformer that fits on, and transforms, only 'Synopsis'.
synopsis_clean_col_tran = ColumnTransformer(
    transformers=[
        ('synopsis_clean_col_tran', SynopsisCleaner(), ['Synopsis']),
    ],
    # Pass every unspecified column along untouched to the next steps.
    remainder='passthrough',
)

# Assemble the pipeline from all the steps, then fit it on the training data.
pipeline_steps = [('synopsis_cleaning', synopsis_clean_col_tran)]
pipe_1 = Pipeline(steps=pipeline_steps)
pipe_1.fit(train_data)

我收到KeyError，如下所示：

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self,key,method,tolerance)
   2890             try:
-> 2891                 return self._engine.get_loc(casted_key)
   2892             except KeyError as err:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 0

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
16 frames
<ipython-input-10-3396fa5d6092> in <module>()
      6 # make a pipeline Now with all the steps
      7 pipe_1 = Pipeline(steps=[('synopsis_cleaning',synopsis_clean_col_tran)])
----> 8 pipe_1.fit(train_data)

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit(self,y,**fit_params)
    352                                  self._log_message(len(self.steps) - 1)):
    353             if self._final_estimator != 'passthrough':
--> 354                 self._final_estimator.fit(Xt,**fit_params)
    355         return self
    356 

/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in fit(self,y)
    482         # we use fit_transform to make sure to set sparse_output_ (for which we
    483         # need the transformed data) to have consistent output type in predict
--> 484         self.fit_transform(X,y=y)
    485         return self
    486 

/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in fit_transform(self,y)
    516         self._validate_remainder(X)
    517 
--> 518         result = self._fit_transform(X,_fit_transform_one)
    519 
    520         if not result:

/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in _fit_transform(self,func,fitted)
    455                     message=self._log_message(name,idx,len(transformers)))
    456                 for idx,(name,trans,column,weight) in enumerate(
--> 457                         self._iter(fitted=fitted,replace_strings=True),1))
    458         except ValueError as e:
    459             if "Expected 2D array,got 1D array instead" in str(e):

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self,iterable)
   1027             # remaining jobs.
   1028             self._iterating = False
-> 1029             if self.dispatch_one_batch(iterator):
   1030                 self._iterating = self._original_iterator is not None
   1031 

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in dispatch_one_batch(self,iterator)
    845                 return False
    846             else:
--> 847                 self._dispatch(tasks)
    848                 return True
    849 

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in _dispatch(self,batch)
    763         with self._lock:
    764             job_idx = len(self._jobs)
--> 765             job = self._backend.apply_async(batch,callback=cb)
    766             # A job can complete so quickly than its callback is
    767             # called before we get here,causing self._jobs to

/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in apply_async(self,callback)
    206     def apply_async(self,callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in __init__(self,batch)
    570         # Don't delay the application,to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self)
    251         with parallel_backend(self._backend,n_jobs=self._n_jobs):
    252             return [func(*args,**kwargs)
--> 253                     for func,args,kwargs in self.items]
    254 
    255     def __reduce__(self):

/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in <listcomp>(.0)
    251         with parallel_backend(self._backend,n_jobs=self._n_jobs):
    252             return [func(*args,**kwargs)
--> 253                     for func,args,kwargs in self.items]
    254 
    255     def __reduce__(self):

/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer,weight,message_clsname,message,**fit_params)
    726     with _print_elapsed_time(message_clsname,message):
    727         if hasattr(transformer,'fit_transform'):
--> 728             res = transformer.fit_transform(X,**fit_params)
    729         else:
    730             res = transformer.fit(X,**fit_params).transform(X)

/usr/local/lib/python3.6/dist-packages/sklearn/base.py in fit_transform(self,**fit_params)
    569         if y is None:
    570             # fit method of arity 1 (unsupervised transformation)
--> 571             return self.fit(X,**fit_params).transform(X)
    572         else:
    573             # fit method of arity 2 (supervised transformation)

<ipython-input-6-004ee595d544> in transform(self,**fit_params)
     20     def transform(self,**fit_params):
     21         for i in range(X.shape[0]):
---> 22             X[i] = self.clean_text(X[i])
     23         return X

/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self,key)
   2900             if self.columns.nlevels > 1:
   2901                 return self._getitem_multilevel(key)
-> 2902             indexer = self.columns.get_loc(key)
   2903             if is_integer(indexer):
   2904                 indexer = [indexer]

/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self,tolerance)
   2891                 return self._engine.get_loc(casted_key)
   2892             except KeyError as err:
-> 2893                 raise KeyError(key) from err
   2894 
   2895         if tolerance is not None:

KeyError: 0

我在做什么错了？

编辑1：去掉方括号、将列名直接指定为字符串之后，我看到的错误如下：

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-11-bdd42b09e2af> in <module>()
      6 # make a pipeline Now with all the steps
      7 pipe_1 = Pipeline(steps=[('synopsis_cleaning',synopsis_clean_col_tran)])
----> 8 pipe_1.fit(train_data)

3 frames
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit(self,y)
    536 
    537         self._update_fitted_transformers(transformers)
--> 538         self._validate_output(Xs)
    539 
    540         return self._hstack(list(Xs))

/usr/local/lib/python3.6/dist-packages/sklearn/compose/_column_transformer.py in _validate_output(self,result)
    400                 raise ValueError(
    401                     "The output of the '{0}' transformer should be 2D (scipy "
--> 402                     "matrix,array,or pandas DataFrame).".format(name))
    403 
    404     def _validate_features(self,n_features,feature_names):

ValueError: The output of the 'synopsis_clean_col_tran' transformer should be 2D (scipy matrix,array,or pandas DataFrame).

解决方法

在手动测试中，您传入的是 Series train_data['Synopsis']，而 ColumnTransformer 传入的是 DataFrame train_data[['Synopsis']]。（这也解释了报错的原因：X[i] 试图获取名为 0 的列，而该列并不存在。）您只需去掉转换器列规格中 'Synopsis' 两侧的方括号，就应该能轻松解决这个问题。文档中写道：

...如果转换器期望X为一维数组（矢量），则应使用标量字符串或整数，否则会将二维数组传递给转换器。 ...

也就是说，

# A scalar column spec ('Synopsis', no brackets) is used for the cleaner here.
synopsis_clean_col_tran = ColumnTransformer(
    transformers=[
        ('synopsis_clean_col_tran', SynopsisCleaner(), 'Synopsis'),
    ],
    # Pass every unspecified column along untouched to the next steps.
    remainder='passthrough',
)

啊，但这样一来 ColumnTransformer 又会抱怨您的转换器输出是一维的；这确实很不巧。我认为此时最干净的做法是修改您的 transform，使其输入和输出都按二维处理。如果您只需要支持以 DataFrame 作为输入（即前面没有其他会把数据转换为 numpy 数组的 sklearn 转换器），那么用 FunctionTransformer 代替自定义类会相对简单。

def clean_text_frame(X):
    """Return ``X`` with ``clean_text`` applied to every cell."""
    # ``clean_text`` is the function that currently lives in your class.
    cleaned = X.applymap(clean_text)
    return cleaned

# FunctionTransformer lives in sklearn.preprocessing; the earlier snippets
# never import it, so bring it into scope before use.
from sklearn.preprocessing import FunctionTransformer

# The list-style column spec ['Synopsis'] is kept so the wrapped function
# receives a 2D frame, which is what the frame-wise applymap call needs.
# NOTE(review): unlike the earlier snippets, no remainder= is given here;
# confirm whether remainder='passthrough' was intended.
synopsis_clean_col_tran = ColumnTransformer(
    transformers=[
        ('synopsis_clean_col_tran', FunctionTransformer(clean_text_frame), ['Synopsis']),
    ],
)

pipeline pipeline scikit-learn sklearn-pandas