问题描述
我正在学习官方的 sklearn 教程(tutorial),了解如何为文本数据分析创建管道(Pipeline),以便之后用于网格搜索。但是我遇到了一个问题:教程中给出的方法在我的场景下不起作用。
我希望此代码能够正常工作:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
# Minimal reproduction of the failing pipeline (intentionally left broken).
# Toy data: ten identical tweets; the scalar 0 is broadcast to every row of 'label'.
df_Xtrain = pd.DataFrame({'tweet': ['This is a tweet']*10,'label': 0})
y_train = df_Xtrain['label'].to_numpy().ravel()
# NOTE: ('tweet') is just the string 'tweet' -- parentheses alone do not make a tuple.
pipe = Pipeline([
('col_selector',ColumnSelector(cols=('tweet'))),('tfidf',TfidfTransformer()),('bernoulli',BernoulliNB()),])
# This call raises
#   ValueError: could not convert string to float: 'This is a tweet'
# because the raw text reaches TfidfTransformer, whose fit() runs check_array on it.
pipe.fit(df_Xtrain,y_train)
而下面这段不使用管道的代码可以正常运行:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy dataset: ten identical tweets, every one labelled 0.
df_Xtrain = pd.DataFrame({'tweet': ['This is a tweet'] * 10, 'label': 0})
y_train = df_Xtrain['label'].to_numpy().ravel()

# Turn the raw text column into a dense TF-IDF matrix in a single step.
mc = 'tweet'
vec_tfidf = TfidfVectorizer()
X_train = vec_tfidf.fit_transform(df_Xtrain[mc]).toarray()

# Train the classifier (fit returns the estimator itself), then predict and
# score on the training data.
model = BernoulliNB().fit(X_train, y_train)
model.predict(X_train)
model.score(X_train, y_train)
问题
如何像上面那样建立用于文本分析的管道?
更新
版本
[('numpy','1.17.5'),('pandas','1.0.5'),('sklearn','0.23.1'),('mlxtend','0.17.0')]
Python 3.7.7
错误日志
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-1-3012ce7245d9> in <module>
19
20
---> 21 pipe.fit(df_Xtrain,y_train)
~/opt/miniconda3/envs/spk/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self,X,y,**fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X,**fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~/opt/miniconda3/envs/spk/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self,**fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~/opt/miniconda3/envs/spk/lib/python3.7/site-packages/joblib/memory.py in __call__(self,*args,**kwargs)
350
351 def __call__(self,**kwargs):
--> 352 return self.func(*args,**kwargs)
353
354 def call_and_shelve(self,**kwargs):
~/opt/miniconda3/envs/spk/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer,weight,message_clsname,message,**fit_params)
738 with _print_elapsed_time(message_clsname,message):
739 if hasattr(transformer,'fit_transform'):
--> 740 res = transformer.fit_transform(X,**fit_params)
741 else:
742 res = transformer.fit(X,**fit_params).transform(X)
~/opt/miniconda3/envs/spk/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self,**fit_params)
691 else:
692 # fit method of arity 2 (supervised transformation)
--> 693 return self.fit(X,**fit_params).transform(X)
694
695
~/opt/miniconda3/envs/spk/lib/python3.7/site-packages/sklearn/feature_extraction/text.py in fit(self,y)
1429 A matrix of term/token counts.
1430 """
-> 1431 X = check_array(X,accept_sparse=('csr','csc'))
1432 if not sp.issparse(X):
1433 X = sp.csr_matrix(X)
~/opt/miniconda3/envs/spk/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args,**kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k,arg in zip(sig.parameters,args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~/opt/miniconda3/envs/spk/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array,accept_sparse,accept_large_sparse,dtype,order,copy,force_all_finite,ensure_2d,allow_nd,ensure_min_samples,ensure_min_features,estimator)
597 array = array.astype(dtype,casting="unsafe",copy=False)
598 else:
--> 599 array = np.asarray(array,order=order,dtype=dtype)
600 except ComplexWarning:
601 raise ValueError("Complex data not supported\n"
~/opt/miniconda3/envs/spk/lib/python3.7/site-packages/numpy/core/_asarray.py in asarray(a,order)
83
84 """
---> 85 return array(a,copy=False,order=order)
86
87
ValueError: could not convert string to float: 'This is a tweet'
解决方法
您的代码有2个主要问题-
- 您使用了 `TfidfTransformer`,却没有先使用 `CountVectorizer`。其实只需改用 `TfidfVectorizer`,即可一步完成这两个操作。
- 您的 `ColumnSelector` 返回的是 2D 数组 `(n,1)`,而 `TfidfVectorizer` 期望的输入是 1D 数组 `(n,)`。可以通过设置参数 `drop_axis=True` 来解决。
进行上述更改,即可正常工作-
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from mlxtend.feature_selection import ColumnSelector
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
# Toy training data: ten identical tweets, every one labelled 0.
df_Xtrain = pd.DataFrame({'tweet': ['This is a tweet'] * 10, 'label': 0})
y_train = df_Xtrain['label'].to_numpy().ravel()

# Select the text column as a 1D array (drop_axis=True), vectorise it with
# TF-IDF, then classify with Bernoulli naive Bayes.
pipe = Pipeline([
    ('col_selector', ColumnSelector(cols='tweet', drop_axis=True)),
    ('tfidf', TfidfVectorizer()),
    ('bernoulli', BernoulliNB()),
])
pipe.fit(df_Xtrain, y_train)
Pipeline(steps=[('col_selector', ColumnSelector(cols='tweet', drop_axis=True)),
                ('tfidf', TfidfVectorizer()), ('bernoulli', BernoulliNB())])
编辑:对问题的回答-“没有mlxtend包是否有可能?为什么我在这里需要ColumnSelector?仅使用sklearn有解决方案吗?”
是的,正如我在下面提到的,您将必须构建自己的列选择器类(这也是构建自己的转换器以添加到管道中的方式)。
class SelectColumnsTransformer:
    """Transformer step that selects column(s) from its input by label.

    Parameters
    ----------
    columns : label or list of labels, optional
        Key(s) used to index the input as ``X[columns]`` in ``transform``.
    """

    def __init__(self, columns=None):
        self.columns = columns

    def transform(self, X, **transform_params):
        """Return a copy of the selection ``X[self.columns]``.

        Extra keyword arguments are accepted and ignored.
        """
        cpy_df = X[self.columns].copy()
        return cpy_df

    def fit(self, X=None, y=None, **fit_params):
        """Do nothing and return ``self``; this step has nothing to learn.

        ``X`` is accepted as the first positional argument so that calls of
        the form ``fit(X, y)`` work. Both ``X`` and ``y`` default to ``None``
        so earlier call styles (``fit()``, ``fit(y=...)``) keep working.
        """
        return self
# Add it to a pipeline.
# Replace the placeholder string below with the name of the column to select.
pipe = Pipeline([
    ('selector', SelectColumnsTransformer(['<input col name here>'])),
])
有关如何使用它的更多信息,请参考this link。