问题描述
我正在尝试按多个列对数据帧进行分组,并使用用户自定义函数进行聚合。我想按[monthEndDate]和[stock]分组,对[volume]和[logRet]两列分别进行一些汇总。
数据框看起来像这样
+---------------------------------------------------+
| date monthEndDate stock volume logRet |
+---------------------------------------------------+
| 1990-01-01 1990-01-31 A 1 NA |
| 1990-01-02 1990-01-31 A 2 0.2 |
| 1990-02-01 1990-02-28 A 3 0.3 |
| 1990-02-02 1990-02-28 A 4 0.4 |
| ... ... |
| 1990-01-01 1990-01-31 B 1 NA |
| 1990-01-02 1990-01-31 B 2 0.08 |
| ... ... |
| 1990-02-01 1990-02-28 B 0 0.3 |
| 1990-02-02 1990-02-28 B 3 0.4 |
| ... ... |
+---------------------------------------------------+
我尝试了以下方法:
def check_input(fn):
    """Decorate ``fn`` so that its first argument is type-checked.

    The returned wrapper accepts the same arguments as ``fn`` and forwards
    all of them unchanged.

    Args:
        fn: Callable whose first positional argument is the data to check.

    Returns:
        A wrapper around ``fn`` that keeps ``fn``'s ``__name__``.

    Raises:
        TypeError: (from the wrapper) if the first argument is not a
            ``pd.Series``, ``pd.DataFrame`` or ``np.ndarray``.
    """
    # Local import keeps this snippet self-contained.
    import functools

    # functools.wraps preserves fn.__name__ (the original copied it by hand).
    @functools.wraps(fn)
    def wrapper(_s, *args, **kwargs):
        # np.ndarray is the array *type*; np.array is only a factory function
        # and is not a valid isinstance() target.
        if not isinstance(_s, (pd.Series, pd.DataFrame, np.ndarray)):
            raise TypeError(
                "expected a pandas Series, pandas DataFrame or numpy ndarray, "
                f"got {type(_s).__name__}"
            )
        # Forward extra positional arguments too; they used to be dropped.
        return fn(_s, *args, **kwargs)

    return wrapper
@check_input
def varLogRet(_s):
    """Variance of the non-zero entries of ``_s``, as a one-item Series.

    This is the version from the question. It indexes ``_s`` directly with
    the result of ``np.nonzero``, the same expression that raises in
    ``avgVolume`` in the traceback shown in this post.
    """
    # np.nonzero returns a tuple of index arrays, not a boolean mask.
    nonzero_idx = np.nonzero(_s)
    selected = _s[nonzero_idx]
    return pd.Series({'varLogRet': np.var(selected)})
@check_input
def TotRet1M(_s):
    """Exponential of the sum of ``_s``, as a one-item Series.

    Presumably converts summed log returns into a total return for the
    group -- confirm against the caller.
    """
    summed = np.sum(_s)
    return pd.Series({'TotRet1M': np.exp(summed)})
@check_input
def avgVolume(_s):
    """Mean of the non-zero entries of ``_s``, as a one-item Series.

    This is the version from the question. The traceback in this post shows
    the Series indexing below raising
    ``ValueError: Can only tuple-index with a MultiIndex``.
    """
    # np.nonzero returns a tuple of index arrays, not a boolean mask.
    nonzero_idx = np.nonzero(_s)
    selected = _s[nonzero_idx]
    return pd.Series({'avgVolume': np.mean(selected)})
# Group by month end and item id, then apply the user-defined aggregations:
# two to 'logRet' and one to 'volume'.
return_m = (
    price_d
    .groupby(['monthEndDate', 'TradingItemId'])
    .agg({
        'logRet': [varLogRet, TotRet1M],
        'volume': avgVolume,
    })
)
但是,它失败并出现ValueError:
Traceback (most recent call last):
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py",line 663,in agg_series
return self._aggregate_series_fast(obj,func)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py",line 681,in _aggregate_series_fast
result,counts = grouper.get_result()
File "pandas/_libs/reduction.pyx",line 429,in pandas._libs.reduction.SeriesGrouper.get_result
File "pandas/_libs/reduction.pyx",line 413,in pandas._libs.reduction.SeriesGrouper.get_result
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\groupby.py",line 894,in <lambda>
f = lambda x: func(x,**kwargs)
File "<ipython-input-15-37583dd006b9>",line 5,in wrapper
return fn(_s,**kwargs)
File "<ipython-input-15-37583dd006b9>",line 22,in avgVolume
return pd.Series({'avgVolume': np.mean(_s[np.nonzero(_s)])})
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py",line 1113,in __getitem__
return self._get_with(key)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py",line 1127,in _get_with
return self._get_values_tuple(key)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\series.py",line 1172,in _get_values_tuple
raise ValueError("Can only tuple-index with a MultiIndex")
ValueError: Can only tuple-index with a MultiIndex
During handling of the above exception,another exception occurred:
Traceback (most recent call last):
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\IPython\core\interactiveshell.py",line 3343,in run_code
exec(code_obj,self.user_global_ns,self.user_ns)
File "<ipython-input-19-278a006fcb71>",line 1,in <module>
return_m = price_d.groupby(['monthEndDate','TradingItemId']).agg({'volume': avgVolume})
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\generic.py",line 1455,in aggregate
return super().aggregate(arg,**kwargs)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\generic.py",line 229,in aggregate
result,how = self._aggregate(func,_level=_level,**kwargs)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\base.py",line 506,in _aggregate
result = _agg(arg,_agg_1dim)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\base.py",line 456,in _agg
result[fname] = func(fname,agg_how)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\base.py",line 440,in _agg_1dim
return colg.aggregate(how,_level=(_level or 0) + 1)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\generic.py",line 860,in aggregate
return self._python_agg_general(func_or_funcs,**kwargs)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\groupby.py",line 900,in _python_agg_general
result,counts = self.grouper.agg_series(obj,f)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py",line 665,in agg_series
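return self._aggregate_series_pure_python(obj,func)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\ops.py",line 694,in _aggregate_series_pure_python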
res = func(group)
File "C:\Users\xx\PycharmProjects\venvs\factor_model\lib\site-packages\pandas\core\groupby\groupby.py",in _get_values_tuple
raise ValueError("Can only tuple-index with a MultiIndex")
ValueError: Can only tuple-index with a MultiIndex
解决方法
添加 .iloc 为我解决了此问题。原因是 nonzero() 函数返回的是非零元素在数组中的位置索引(而不是索引标签),因此需要使用 iloc 按位置进行选择。
def varLogRet(_s):
    """Variance of the non-zero entries of ``_s``, as a one-item Series.

    Args:
        _s: A pandas Series (needs ``.to_numpy()`` and ``.iloc``).

    Returns:
        ``pd.Series`` with the single key ``'varLogRet'``.
    """
    # nonzero() yields positions in the underlying array, so the selection
    # has to go through the positional indexer .iloc.
    raw_values = _s.to_numpy()
    nonzero_positions = raw_values.nonzero()
    kept = _s.iloc[nonzero_positions]
    return pd.Series({'varLogRet': np.var(kept)})
def TotRet1M(_s):
    """Exponential of the sum of ``_s``, as a one-item Series.

    Presumably converts summed log returns into a total return for the
    group -- confirm against the caller.

    Args:
        _s: The values to sum.

    Returns:
        ``pd.Series`` with the single key ``'TotRet1M'``.
    """
    summed = np.sum(_s)
    return pd.Series({'TotRet1M': np.exp(summed)})
def avgVolume(_s):
    """Mean of the non-zero entries of ``_s``, as a one-item Series.

    Args:
        _s: A pandas Series (needs ``.to_numpy()`` and ``.iloc``).

    Returns:
        ``pd.Series`` with the single key ``'avgVolume'``.
    """
    # nonzero() yields positions in the underlying array, so the selection
    # has to go through the positional indexer .iloc.
    raw_values = _s.to_numpy()
    nonzero_positions = raw_values.nonzero()
    kept = _s.iloc[nonzero_positions]
    return pd.Series({'avgVolume': np.mean(kept)})
# Group by month end and stock, then apply the user-defined aggregations:
# two to 'logRet' and one to 'volume'.
(
    temp
    .groupby(['monthEndDate', 'stock'])
    .agg({
        'logRet': [varLogRet, TotRet1M],
        'volume': avgVolume,
    })
)
输出
logRet volume
varLogRet TotRet1M avgVolume
monthEndDate stock
1990-01-31 A 0.0000 1.221403 1.5
1990-02-28 B 0.0025 2.013753 3.5
...
此外,我使用 _s.to_numpy().nonzero() 而不是 np.nonzero(_s),只是因为我使用的 pandas 版本与您不同。自 pandas 0.24.0 起,Series.nonzero 已被弃用,官方建议改用 to_numpy().nonzero(),
参见 https://pandas.pydata.org/pandas-docs/version/0.25.3/reference/api/pandas.Series.nonzero.html#pandas-series-nonzero 。