尝试通过回归预测算法的运行时间

问题描述

我正在关注这篇论文: http://robotics.stanford.edu/users/shoham/www%20papers/Empirical%20Hardness.pdf

我尝试在黑盒求解器上预测旅行商问题的运行时间。

我在回归过程中得到了一些我很想咨询的奇怪结果:

  1. 我很难相信在 XGBOOST 或任何回归器中,城市数量与特征无关?如 XGBOOST 特征重要性图像所示。
  2. 在 RIDGE 和 LINEAR REGRESSION 结果图中,您可以看到对于某些问题实例,图中您可以看到预测值为负(当我们谈论运行时间时),我在这里的其他问题中看到这是因为“线性回归不尊重 0" 的界限,我应该在它上面放一个自然对数,但我不知道确切的位置。所以我也很乐意帮忙。
  3. 我也很乐意被推荐使用其他可能适合我的问题的回归模型。

非常感谢!

这是我的代码片段(google colab),然后是我得到的结果:

1 # 导入pandas的标准库。

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings. filterwarnings("ignore")
sns.set_style('whitegrid')
from google.colab import files

2 # 安装求解器并导入它的库,另外导入所有的 # 个库,我们将用它来准备功能

!pip3 install ortools 
!pip install python-igraph

from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp

import numpy as np
import time
import random 
from random import randrange
from scipy import stats
from scipy.stats import skew
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse.csgraph import depth_first_tree
from igraph import Graph,mean
import igraph
import itertools
import math

3 # 城市之间的简单旅行商问题 - 求解器或 Google 工具。

def create_data_model():
    # Stores the data for the problem. 
    data = {}
    # dim will be the number of Vertices\Cities in the Traveling Salesman Problem.
    # Randomly select the matrix dimension in unifom distribution.
    dim = np.random.randint(10,350) 
    # Generate a square symmetric matrix It will be the distance matrix that the solver will solve.
    square_matrice = [[0 for row in range(dim)] for col in range(dim)]
    for i in range(dim):
        for j in range(dim):
            if i == j:
                square_matrice[i][j] = 0
            else:
                # Randomly fill the matrix in unifom distribution.
                square_matrice[i][j] = square_matrice[j][i] = np.random.randint(1,1000) 
    data['distance_matrix'] = square_matrice # yapf: disable
    data['num_vehicles'] = 1
    data['depot'] = 0
    return data


def main():
    # Start measuring solution time.
    start_time = time.time()
    # Instantiate the data problem.
    data = create_data_model()
    # Create the routing index manager.
    manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),data['num_vehicles'],data['depot'])
    # Create Routing Model.
    routing = pywrapcp.RoutingModel(manager)
    def distance_callback(from_index,to_index):
        # Returns the distance between the two nodes. 
        # Convert from routing variable Index to distance matrix NodeIndex.
        from_node = manager.IndexToNode(from_index)
        to_node = manager.IndexToNode(to_index)
        return data['distance_matrix'][from_node][to_node]
    transit_callback_index = routing.RegisterTransitCallback(distance_callback)
    # Define cost of each arc.
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)
    # Setting first solution heuristic.
    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEApest_ARC)
    # Solve the problem.
    solution = routing.solveWithParameters(search_parameters)
    solution_time = time.time() - start_time

    '''In this part of the code we will create the following features on the distance matrix of the problem.
    *  Mean - Average weights of the distance matrix.
    *  Std - Standard Deviation of the distance matrix.
    *  Skewness  - What is the tendency of the weights in the distance matrix.
    *  Noc - Number of cities we have in the distance matrix [matrix dimension].
    *  Td - The total distance of the solution rout. 
    *  Dmft - distance matrix features time,That is how long it took us to calculate all these features.
    '''
    dmt_start_time = time.time()
    mat = np.array(data['distance_matrix'])
    mean = mat.mean()
    std = mat.std()
    merged = list(itertools.chain(*mat))
    skewness = skew(merged)
    noc = len(data['distance_matrix'])
    td = solution.ObjectiveValue() if solution else -1
    dmft = time.time() - dmt_start_time

    '''In this part of the code we will create from the distance matrix of the problem an MST and than
    on the MST we take the following features.
    *  MST_Mean - Average weights of the MST.
    *  MST_Std - Standard Deviation of the MST.
    *  MST_Skewness  - What is the tendency of the weights in the MST.
    *  MST_ft - MST features time,That is how long it took us to calculate the MST & all these features.
    '''
    spt_start_time = time.time()
    X = csr_matrix(mat)
    Tcsr = minimum_spanning_tree(X)
    mat_st = np.array(Tcsr.toarray().astype(int))
    mst_mean = mat_st.mean()
    mst_std = mat_st.std()
    merged_st = list(itertools.chain(*mat_st))
    mst_skewness = skew(merged_st)
    mst_ft = time.time() - spt_start_time

    '''In this part of the code we calculate features from the MST that are considered to be
    related to the rank and depth of the tracks in it.
    *  D_Mean - Average degree of the MST.
    *  D_Std - Standard Deviation of the MST degrees.
    *  D_Skewness  - What is the tendency of the degrees in the MST.
    *  DFT_Mean - The average weight of the deepest track in MST.
    *  DFT_Std - Standard Deviation of the deepest track in MST.
    *  DFT_Max - The heaviest arch on the longest route in MST.
    *  DDFT_ft - Degree & DFT features time,That is how long it took us to calculate all these features.
    '''
    dstt_start_time = time.time()
    g = Graph.Weighted_Adjacency(mat_st.tolist())
    d_mean = igraph.statistics.mean(g.degree())
    d_std = igraph.statistics.sd(g.degree())
    d_skewness = skew(g.degree())
    d_t = depth_first_tree(X,directed=False)
    mat_dt = np.array(d_t.toarray().astype(int))
    dft_mean = mat_dt.mean()
    dft_std = mat_dt.std()
    dft_max = np.amax(mat_dt)
    ddft_ft = time.time() - dstt_start_time

    # In this map we will hold all the features and their results.
    features_map = {'Mean': mean,'Std': std,'Skewness': skewness,'Noc': noc,'Td': td,'Dmft': dmft,'MST_Mean': mst_mean,'MST_Std': mst_std,'MST_Skewness': mst_skewness,'MST_ft': mst_ft,'D_Mean': d_mean,'D_Std': d_std,'D_Skewness': d_skewness,'DFT_Mean': dft_mean,'DFT_Std': dft_std,'DFT_Max': dft_max,'DDFT_ft': ddft_ft,'Solution_time': solution_time}

    return features_map

# Main    
# Create dataFrame.
data_TSP = pd.DataFrame()
# Fill the dataFrame.
for i in range(10000):
    #print(i)
    features_map = main()
    data_TSP = data_TSP.append(features_map,ignore_index=True) 
# Show data frame.

data_TSP.head()
data_TSP.to_csv('data_10000.csv')
files.download('data_10000.csv')

回归模型:

# Import the standard libraries of pandas.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings. filterwarnings("ignore")
sns.set_style('whitegrid')

2 # 需要打开驱动器中的数据文件

from google.colab import files
uploaded = files.upload()
import io

df = pd.read_csv(io.BytesIO(uploaded['data_10000_clean.csv']))
try:
    df.drop(['Unnamed: 0'],axis=1,inplace=True) 
except: 
    pass
df.head()

从 sklearn.model_selection 导入 train_test_split

# Split the data to training set and test set (70%,30%)
features = list(df.drop('Solution_time',axis = 1,inplace = False))
y = df['Solution_time']
X = df[features]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3)

导入我们用解决方案预测的模型,

以及评估这些模型的评分方法

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import gridsearchcv
from sklearn import linear_model
!pip3 install xgboost
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
!pip install scikit-plot
import scikitplot as skplt
import matplotlib as mpl

########################################## Several functions for different regression  models. ##########################################

class score:

    r2 = 0.0                   # This score determines how close the predictions really are to the real data.
    cross_vali_score = 0.0     # How true is our algorithm if it going to well predict new data.

class Regressor:

    def __init__(self,name):
        self.name = name
        self.score = score()
        self.y_pred = None
        self.reg = None

# Map between the name of a model and the model itself.
models_map = {'Random Forest': RandomForestRegressor(),'Xgboost': XGBRegressor(),'Ridge': Ridge(),'Kneighbors': KNeighborsRegressor(),'Linear Regressor': linear_model.LinearRegression()}

# This function return a map that maps between each model and its Regressor class.
def get_models():
    result_map = {}
    for key,val in models_map.items():
        result = Regressor(key)
        reg = val
        result.score.cross_vali_score = np.mean(cross_val_score(reg,X_train,cv=5))        
        result.reg = reg.fit(X_train,y_train) 
        result.y_pred = reg.predict(X_test) 
        result.score.r2 = r2_score(y_test,result.y_pred) 
        result_map[key] = result 
    return result_map

# This function print a graph for models of the features that influenced their decision making.
def print_influence_graph(map):
    for key,val in map.items():
        if key == 'Random Forest' or key == 'Xgboost':
            # The parameters that most influenced the decision.
            feature_imp = pd.Series(val.reg.feature_importances_,index=features).sort_values(ascending=False)
            sns.barplot(x=feature_imp,y=feature_imp.index)
            # Add labels to your graph
            plt.xlabel('Feature Importance score')
            plt.ylabel('Features')
            plt.title(val.name.upper() +" - Visualizing Important Features")
            plt.show()

# This function print a graph for models that show the real results against the model predictions.
def show_predicted_vs_actual(map):
    for key,val in map.items():
        fig,ax = plt.subplots()
        ax.scatter(y_test,val.y_pred,edgecolors=(0,1))
        ax.plot([y_test.min(),y_test.max()],[y_test.min(),'r--',lw=3)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.title.set_text(val.name.upper() +" - Predicted time vs actual time")
        plt.show()

# This function print numerical scores for the models.
def print_scores(map):
    for key,val in map.items():
        print(val.name.upper() + ' score: ')
        print('R2score' + '  =  ',val.score.r2)
        print('Cross_val_score' + '  =  ',val.score.cross_vali_score)
        print('------------------------------------------\n')

# This function print a graph showing the differences between the scores of the models.
def show_models_differences_graph(map):
    comp_df = pd.DataFrame(columns = ('Method','R2 score','Cross val score'))
    for i in map:
        row = {'Method': i,'R2 score': map[i].score.r2,'Cross val score': map[i].score.cross_vali_score}
        comp_df = comp_df.append(row,ignore_index=True)
    ax = comp_df.plot.bar(x='Method',rot=30,figsize=(12,6))
    ax.set_title('Comparison graph')   
#########################################################################################################################################

models = get_models()
print_influence_graph(models)
show_predicted_vs_actual(models)
print_scores(models)
show_models_differences_graph(models)

结果如下:

enter image description here

enter image description here

enter image description here

enter image description here

enter image description here

enter image description here

enter image description here

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)