使用sklearn进行三次样条回归?

问题描述

我想非常准确地回归一个与单个变量 x 非线性相关的目标。在我的 sklearn 管道中,我使用:

# BUG FIX: the class is PolynomialFeatures (capital P); lowercase
# `polynomialFeatures` raises NameError.
pipe = Pipeline([('poly',PolynomialFeatures(3,include_bias=False)),\
                     ('regr',ElasticNet(random_state=0))])

这似乎在准确性方面给出了与 np.polyfit(x,y,3) 相似的结果。但是,我基本上可以通过使用三次样条来达到机器精度。见下图,我展示了数据和各种拟合,以及残差。 [注意:下面的例子有 50 个样本。我实际有 2000 个样本]

我有两个问题:

  1. 知道为什么 polyfit(或 polyfeat + ElasticNet)无法达到相同水平的准确度吗?
  2. scikit-learn 中使用三次样条进行目标回归的任何示例?
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.interpolate import interp1d
    %matplotlib inline
    
    data = pd.read_csv('example.txt') # added to this post below
    
    p = np.polyfit(data['x'],data['y'],3)
    data['polyfit'] = np.poly1d(p)(data['x'])
    f = interp1d(data['x'],kind='cubic')
    data['spline'] = f(data['x'])
    
    fig,axes = plt.subplots(nrows=3,sharex=True)
    
    axes[0].plot(data['x'],data['polyfit'],'.',label='polyfit')
    axes[0].plot(data['x'],data['spline'],label='spline')
    axes[0].plot(data['x'],label='true')
    axes[0].legend()
    
    axes[1].plot(data['x'],data['polyfit']-data['y'],label='error polyfit')
    axes[1].legend()
    
    axes[2].plot(data['x'],data['spline']-data['y'],label='error spline')
    axes[2].legend()
    
    plt.show()

comparison of various fits

数据如下:

example.txt:

,x,y
257,6.26028462060192,-1233.748349982897
944,4.557099191827032,928.1430280794456
1560,6.765081341690966,-1807.9090703632864
504,4.0015671921214775,1683.311523022658
1499,3.0496689401255783,3055.291788377236
1247,5.608726443314061,-441.9226126757499
1856,4.6124942196224845,845.129184983355
1495,1.273838638033053,5479.078773760113
1052,5.353775782315625,-115.14032709875217
247,2.6495185259267076,3656.7467318648232
1841,9.73337795053495,-4884.806993807511
1574,1.1772247845133335,5544.080005636716
1116,5.698561786140379,-555.3435567718
1489,4.184371293153768,1427.6922357286753
603,1.568868565047676,5179.156099377357
358,4.534081088923849,960.3983442022857
774,9.304809492028289,-4468.215701489676
1525,9.17423541311121,-4340.565494266174
1159,6.705834877066449,-1750.189447626367
1959,3.0431599461645207,3065.358649171256
1086,1.3861557136230234,5378.274828554064
81,4.728366950632029,682.7245723055514
1791,6.954198834068505,-2027.0414501796324
234,2.8672306789699844,3330.7282514295102
1850,2.0086469278742363,4603.0931759401155
1531,9.843164998128215,-4973.735518791005
903,1.534448692052103,5220.331847067942
1258,7.243723209152924,-2354.629822080041
645,2.3302780902754514,4128.077572586273
1425,3.295574067849755,2694.766296765896
311,2.3225198086033756,4152.206609684557
219,8.479436097125713,-3665.2515034579396
1917,7.1524135031820135,-2253.3455629418195
1412,6.79800860136838,-1861.3756670478142
705,1.9001265482939966,4756.283634364785
663,3.441268690856777,2489.7632239249424
1871,6.473544271091015,-1480.6593600880415
1897,8.217615163361007,-3386.5427698021977
558,6.609652057181176,-1634.1672307700298
553,5.679571371137544,-524.352981663938
1847,6.487178186324092,-1500.1891501936236
752,9.377368455681758,-4548.188126821915
1469,8.586759667609758,-3771.691600599668
1794,6.649801445466815,-1674.4870918398076
968,1.6226439291315056,5117.8804886837
108,3.0077346937655647,3118.0786841570025
96,6.278616413290749,-1245.4758811316083
994,7.631678455127069,-2767.3224262153176
871,2.6696610777085863,3630.02481913033
1405,9.209358577104299,-4368.622350004463

解决方法

前两种方法都是对数据拟合单个三次多项式,而(顾名思义)interp1d 用三次样条对数据进行插值:即对于每一对连续的点都存在一条三次曲线,保证完美匹配(达到计算精度)。

,

我想对我之前的问题进行跟进。可以使用 scikit-learn 拟合基于 B-spline 的模型,该模型具有有限的复杂性(预定义的样条数 - 不像 interp1d 那样随着点数增长)。以下代码取自 robust_splines_sklearn.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import splrep,splev

from sklearn.preprocessing import PolynomialFeatures
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression,ElasticNet

from sklearn.metrics import mean_squared_error

def get_bspline_basis(knots,degree=3,periodic=False):
    """Return one (t, c, k) spline tuple per basis spline.

    Parameters
    ----------
    knots : array-like
        Interior knot positions (also used as dummy data sites for splrep).
    degree : int, default 3
        Spline degree `k`.
    periodic : bool, default False
        Passed to splrep's `per` flag.

    Returns
    -------
    list of (t, c, k) tuples, one per knot, each usable with `splev`.
    """
    nknots = len(knots)
    y_dummy = np.zeros(nknots)

    # splrep returns the full knot vector t (with repeated boundary knots)
    # and the degree k; the dummy coefficients are discarded below.
    knots,coeffs,degree = splrep(knots,y_dummy,k=degree,per=periodic)
    ncoeffs = len(coeffs)
    bsplines = []
    for ispline in range(nknots):
        # One-hot coefficient vector selects a single basis spline.
        coeffs = [1.0 if ispl == ispline else 0.0 for ispl in range(ncoeffs)]
        # BUG FIX: splev needs the full (t, c, k) tuple; the original
        # appended only (knots, degree), dropping the coefficients.
        bsplines.append((knots,coeffs,degree))
    return bsplines

class BSplineFeatures(TransformerMixin):
    """sklearn-compatible transformer that expands each input feature
    onto a fixed B-spline basis (one output column per basis spline
    per input feature)."""

    def __init__(self,knots,degree=3,periodic=False):
        # BUG FIX: `degree` was referenced here but missing from the
        # signature, raising NameError; default of 3 keeps the existing
        # call site `BSplineFeatures(knots, periodic=False)` working.
        self.bsplines = get_bspline_basis(knots,degree,periodic=periodic)
        self.nsplines = len(self.bsplines)

    def fit(self,X,y=None):
        # Stateless transformer: the basis is fixed at construction time.
        return self

    def transform(self,X):
        """Evaluate every basis spline at X.

        X is expected to be 2-D (nsamples, nfeatures); the output has
        shape (nsamples, nfeatures * nsplines).
        """
        nsamples,nfeatures = X.shape
        features = np.zeros((nsamples,nfeatures * self.nsplines))
        for ispline,spline in enumerate(self.bsplines):
            istart = ispline * nfeatures
            iend = (ispline + 1) * nfeatures
            features[:,istart:iend] = splev(X,spline)
        return features

    
data = pd.read_csv('example.txt')
knots = np.array([1, 1.5, 2, 3, 4, 5, 7.5, 10])

# Two candidate models: cubic polynomial features + ElasticNet versus a
# fixed B-spline basis + ordinary least squares.
models = {
    "polyfit": Pipeline([
        ('poly', PolynomialFeatures(3, include_bias=False)),
        ('regr', ElasticNet(random_state=0)),
    ]),
    "spline": Pipeline([
        ('feature', BSplineFeatures(knots, periodic=False)),
        ('regression', LinearRegression()),
    ]),
}

# sklearn expects a 2-D design matrix: (nsamples, 1).
X = data['x'].to_numpy().reshape((data.shape[0], 1))
y = data['y']

for label, model in models.items():
    model.fit(X, y)
    data[label] = model.predict(X)

# Top panel: fits vs. the true target; lower panels: residuals.
fig, axes = plt.subplots(nrows=3, sharex=True)

axes[0].plot(data['x'], data['polyfit'], '.', label='polyfit')
axes[0].plot(data['x'], data['spline'], label='spline')
axes[0].plot(data['x'], data['y'], label='true')
axes[0].legend()

axes[1].plot(data['x'], data['polyfit'] - data['y'], label='error polyfit')
axes[1].legend()

axes[2].plot(data['x'], data['spline'] - data['y'], label='error spline')
axes[2].legend()

plt.show()

result