问题描述

我正在关注这篇论文： http://robotics.stanford.edu/users/shoham/www%20papers/Empirical%20Hardness.pdf

我尝试在黑盒求解器上预测旅行商问题的运行时间。

我在回归过程中得到了一些我很想咨询的奇怪结果：

我很难相信在 XGBOOST 或任何回归器中，城市数量与特征无关？如 XGBOOST 特征重要性图像所示。
在 RIDGE 和 LINEAR REGRESSION 结果图中，您可以看到对于某些问题实例，图中您可以看到预测值为负（当我们谈论运行时间时），我在这里的其他问题中看到这是因为“线性回归不尊重 0" 的界限，我应该在它上面放一个自然对数，但我不知道确切的位置。所以我也很乐意帮忙。
我也很乐意被推荐使用其他可能适合我的问题的回归模型。

非常感谢！

这是我的代码片段（google colab），然后是我得到的结果：

1 # 导入pandas的标准库。

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings. filterwarnings("ignore")
sns.set_style('whitegrid')
from google.colab import files

2 # 安装求解器并导入它的库，另外导入所有的 # 个库，我们将用它来准备功能。

!pip3 install ortools 
!pip install python-igraph

from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp

import numpy as np
import time
import random 
from random import randrange
from scipy import stats
from scipy.stats import skew
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse.csgraph import depth_first_tree
from igraph import Graph,mean
import igraph
import itertools
import math

3 # 城市之间的简单旅行商问题 - 求解器或 Google 工具。

def create_data_model():
    # Stores the data for the problem. 
    data = {}
    # dim will be the number of Vertices\Cities in the Traveling Salesman Problem.
    # Randomly select the matrix dimension in unifom distribution.
    dim = np.random.randint(10,350) 
    # Generate a square symmetric matrix It will be the distance matrix that the solver will solve.
    square_matrice = [[0 for row in range(dim)] for col in range(dim)]
    for i in range(dim):
        for j in range(dim):
            if i == j:
                square_matrice[i][j] = 0
            else:
                # Randomly fill the matrix in unifom distribution.
                square_matrice[i][j] = square_matrice[j][i] = np.random.randint(1,1000) 
    data['distance_matrix'] = square_matrice # yapf: disable
    data['num_vehicles'] = 1
    data['depot'] = 0
    return data


def main():
    # Start measuring solution time.
    start_time = time.time()
    # Instantiate the data problem.
    data = create_data_model()
    # Create the routing index manager.
    manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),data['num_vehicles'],data['depot'])
    # Create Routing Model.
    routing = pywrapcp.RoutingModel(manager)
    def distance_callback(from_index,to_index):
        # Returns the distance between the two nodes. 
        # Convert from routing variable Index to distance matrix NodeIndex.
        from_node = manager.IndexToNode(from_index)
        to_node = manager.IndexToNode(to_index)
        return data['distance_matrix'][from_node][to_node]
    transit_callback_index = routing.RegisterTransitCallback(distance_callback)
    # Define cost of each arc.
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)
    # Setting first solution heuristic.
    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEApest_ARC)
    # Solve the problem.
    solution = routing.solveWithParameters(search_parameters)
    solution_time = time.time() - start_time

    '''In this part of the code we will create the following features on the distance matrix of the problem.
    *  Mean - Average weights of the distance matrix.
    *  Std - Standard Deviation of the distance matrix.
    *  Skewness  - What is the tendency of the weights in the distance matrix.
    *  Noc - Number of cities we have in the distance matrix [matrix dimension].
    *  Td - The total distance of the solution rout. 
    *  Dmft - distance matrix features time,That is how long it took us to calculate all these features.
    '''
    dmt_start_time = time.time()
    mat = np.array(data['distance_matrix'])
    mean = mat.mean()
    std = mat.std()
    merged = list(itertools.chain(*mat))
    skewness = skew(merged)
    noc = len(data['distance_matrix'])
    td = solution.ObjectiveValue() if solution else -1
    dmft = time.time() - dmt_start_time

    '''In this part of the code we will create from the distance matrix of the problem an MST and than
    on the MST we take the following features.
    *  MST_Mean - Average weights of the MST.
    *  MST_Std - Standard Deviation of the MST.
    *  MST_Skewness  - What is the tendency of the weights in the MST.
    *  MST_ft - MST features time,That is how long it took us to calculate the MST & all these features.
    '''
    spt_start_time = time.time()
    X = csr_matrix(mat)
    Tcsr = minimum_spanning_tree(X)
    mat_st = np.array(Tcsr.toarray().astype(int))
    mst_mean = mat_st.mean()
    mst_std = mat_st.std()
    merged_st = list(itertools.chain(*mat_st))
    mst_skewness = skew(merged_st)
    mst_ft = time.time() - spt_start_time

    '''In this part of the code we calculate features from the MST that are considered to be
    related to the rank and depth of the tracks in it.
    *  D_Mean - Average degree of the MST.
    *  D_Std - Standard Deviation of the MST degrees.
    *  D_Skewness  - What is the tendency of the degrees in the MST.
    *  DFT_Mean - The average weight of the deepest track in MST.
    *  DFT_Std - Standard Deviation of the deepest track in MST.
    *  DFT_Max - The heaviest arch on the longest route in MST.
    *  DDFT_ft - Degree & DFT features time,That is how long it took us to calculate all these features.
    '''
    dstt_start_time = time.time()
    g = Graph.Weighted_Adjacency(mat_st.tolist())
    d_mean = igraph.statistics.mean(g.degree())
    d_std = igraph.statistics.sd(g.degree())
    d_skewness = skew(g.degree())
    d_t = depth_first_tree(X,directed=False)
    mat_dt = np.array(d_t.toarray().astype(int))
    dft_mean = mat_dt.mean()
    dft_std = mat_dt.std()
    dft_max = np.amax(mat_dt)
    ddft_ft = time.time() - dstt_start_time

    # In this map we will hold all the features and their results.
    features_map = {'Mean': mean,'Std': std,'Skewness': skewness,'Noc': noc,'Td': td,'Dmft': dmft,'MST_Mean': mst_mean,'MST_Std': mst_std,'MST_Skewness': mst_skewness,'MST_ft': mst_ft,'D_Mean': d_mean,'D_Std': d_std,'D_Skewness': d_skewness,'DFT_Mean': dft_mean,'DFT_Std': dft_std,'DFT_Max': dft_max,'DDFT_ft': ddft_ft,'Solution_time': solution_time}

    return features_map

# Main    
# Create dataFrame.
data_TSP = pd.DataFrame()
# Fill the dataFrame.
for i in range(10000):
    #print(i)
    features_map = main()
    data_TSP = data_TSP.append(features_map,ignore_index=True) 
# Show data frame.

data_TSP.head()
data_TSP.to_csv('data_10000.csv')
files.download('data_10000.csv')

回归模型：

# Import the standard libraries of pandas.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings. filterwarnings("ignore")
sns.set_style('whitegrid')

2 # 需要打开驱动器中的数据文件。

from google.colab import files
uploaded = files.upload()
import io

df = pd.read_csv(io.BytesIO(uploaded['data_10000_clean.csv']))
try:
    df.drop(['Unnamed: 0'],axis=1,inplace=True) 
except: 
    pass
df.head()

从 sklearn.model_selection 导入 train_test_split

# Split the data to training set and test set (70%,30%)
features = list(df.drop('Solution_time',axis = 1,inplace = False))
y = df['Solution_time']
X = df[features]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3)

导入我们用解决方案预测的模型，

以及评估这些模型的评分方法。

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import gridsearchcv
from sklearn import linear_model
!pip3 install xgboost
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
!pip install scikit-plot
import scikitplot as skplt
import matplotlib as mpl

########################################## Several functions for different regression  models. ##########################################

class score:

    r2 = 0.0                   # This score determines how close the predictions really are to the real data.
    cross_vali_score = 0.0     # How true is our algorithm if it going to well predict new data.

class Regressor:

    def __init__(self,name):
        self.name = name
        self.score = score()
        self.y_pred = None
        self.reg = None

# Map between the name of a model and the model itself.
models_map = {'Random Forest': RandomForestRegressor(),'Xgboost': XGBRegressor(),'Ridge': Ridge(),'Kneighbors': KNeighborsRegressor(),'Linear Regressor': linear_model.LinearRegression()}

# This function return a map that maps between each model and its Regressor class.
def get_models():
    result_map = {}
    for key,val in models_map.items():
        result = Regressor(key)
        reg = val
        result.score.cross_vali_score = np.mean(cross_val_score(reg,X_train,cv=5))        
        result.reg = reg.fit(X_train,y_train) 
        result.y_pred = reg.predict(X_test) 
        result.score.r2 = r2_score(y_test,result.y_pred) 
        result_map[key] = result 
    return result_map

# This function print a graph for models of the features that influenced their decision making.
def print_influence_graph(map):
    for key,val in map.items():
        if key == 'Random Forest' or key == 'Xgboost':
            # The parameters that most influenced the decision.
            feature_imp = pd.Series(val.reg.feature_importances_,index=features).sort_values(ascending=False)
            sns.barplot(x=feature_imp,y=feature_imp.index)
            # Add labels to your graph
            plt.xlabel('Feature Importance score')
            plt.ylabel('Features')
            plt.title(val.name.upper() +" - Visualizing Important Features")
            plt.show()

# This function print a graph for models that show the real results against the model predictions.
def show_predicted_vs_actual(map):
    for key,val in map.items():
        fig,ax = plt.subplots()
        ax.scatter(y_test,val.y_pred,edgecolors=(0,1))
        ax.plot([y_test.min(),y_test.max()],[y_test.min(),'r--',lw=3)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.title.set_text(val.name.upper() +" - Predicted time vs actual time")
        plt.show()

# This function print numerical scores for the models.
def print_scores(map):
    for key,val in map.items():
        print(val.name.upper() + ' score: ')
        print('R2score' + '  =  ',val.score.r2)
        print('Cross_val_score' + '  =  ',val.score.cross_vali_score)
        print('------------------------------------------\n')

# This function print a graph showing the differences between the scores of the models.
def show_models_differences_graph(map):
    comp_df = pd.DataFrame(columns = ('Method','R2 score','Cross val score'))
    for i in map:
        row = {'Method': i,'R2 score': map[i].score.r2,'Cross val score': map[i].score.cross_vali_score}
        comp_df = comp_df.append(row,ignore_index=True)
    ax = comp_df.plot.bar(x='Method',rot=30,figsize=(12,6))
    ax.set_title('Comparison graph')   
#########################################################################################################################################

models = get_models()
print_influence_graph(models)
show_predicted_vs_actual(models)
print_scores(models)
show_models_differences_graph(models)