Problem description
I am following this paper: http://robotics.stanford.edu/users/shoham/www%20papers/Empirical%20Hardness.pdf
I am trying to predict the runtime of a black-box solver on Traveling Salesman Problem instances.
During the regression I got some strange results that I would really like to ask about:
- I find it hard to believe that, in XGBoost (or any regressor), the number of cities is irrelevant as a feature, yet that is what the XGBoost feature-importance plot shows.
- In the Ridge and Linear Regression result plots you can see that, for some problem instances, the predicted value is negative (even though we are talking about runtimes). From other questions here I gather this is because "linear regression does not respect the bound of 0" and that I should apply a natural log, but I do not know exactly where to put it, so I would be glad for help with this too (see the sketch after this list).
- I would also be glad to be recommended other regression models that might suit my problem.
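If I understand the advice correctly, a minimal sketch of where the log could go (assuming the X_train / y_train split produced by the regression code further below, and that all measured runtimes are strictly positive) would be to wrap the regressor in scikit-learn's TransformedTargetRegressor, which fits on log(y) and applies exp to the predictions:
# Rough sketch, not part of my original notebook: fit on log(y), predict through exp(...).
# Assumes X_train, X_test, y_train from the train_test_split below and y > 0.
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
log_linear = TransformedTargetRegressor(regressor=LinearRegression(),func=np.log,inverse_func=np.exp)
log_linear.fit(X_train,y_train)
y_pred = log_linear.predict(X_test)  # exp of the linear prediction, so it can never be negative
The same wrapper should work around Ridge() or any of the other models in models_map.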
Thank you very much!
Here is my code snippet (Google Colab), followed by the results I got:
# 1 - Import the standard libraries of pandas.
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set_style('whitegrid')
from google.colab import files
# 2 - Install the solver and import its library, and also import all the libraries we will use to prepare the features.
!pip3 install ortools
!pip install python-igraph
from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp
import numpy as np
import time
import random
from random import randrange
from scipy import stats
from scipy.stats import skew
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from scipy.sparse.csgraph import depth_first_tree
from igraph import Graph,mean
import igraph
import itertools
import math
# 3 - Simple Traveling Salesman Problem between cities - the solver from Google OR-Tools.
def create_data_model():
    # Stores the data for the problem.
    data = {}
    # dim will be the number of vertices/cities in the Traveling Salesman Problem.
    # Randomly select the matrix dimension from a uniform distribution.
    dim = np.random.randint(10,350)
    # Generate a square symmetric matrix; it will be the distance matrix that the solver will solve.
    square_matrice = [[0 for row in range(dim)] for col in range(dim)]
    for i in range(dim):
        for j in range(dim):
            if i == j:
                square_matrice[i][j] = 0
            else:
                # Randomly fill the matrix from a uniform distribution.
                square_matrice[i][j] = square_matrice[j][i] = np.random.randint(1,1000)
    data['distance_matrix'] = square_matrice  # yapf: disable
    data['num_vehicles'] = 1
    data['depot'] = 0
    return data
def main():
    # Start measuring solution time.
    start_time = time.time()
    # Instantiate the data problem.
    data = create_data_model()
    # Create the routing index manager.
    manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),data['num_vehicles'],data['depot'])
    # Create the routing model.
    routing = pywrapcp.RoutingModel(manager)
    def distance_callback(from_index,to_index):
        # Returns the distance between the two nodes.
        # Convert from routing variable Index to distance matrix NodeIndex.
        from_node = manager.IndexToNode(from_index)
        to_node = manager.IndexToNode(to_index)
        return data['distance_matrix'][from_node][to_node]
    transit_callback_index = routing.RegisterTransitCallback(distance_callback)
    # Define the cost of each arc.
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)
    # Set the first-solution heuristic.
    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC)
    # Solve the problem.
    solution = routing.SolveWithParameters(search_parameters)
    solution_time = time.time() - start_time
    '''In this part of the code we create the following features from the distance matrix of the problem:
    * Mean - Average weight of the distance matrix.
    * Std - Standard deviation of the distance matrix.
    * Skewness - The tendency of the weights in the distance matrix.
    * Noc - Number of cities we have in the distance matrix [matrix dimension].
    * Td - The total distance of the solution route.
    * Dmft - Distance matrix features time, i.e. how long it took us to calculate all these features.
    '''
    dmt_start_time = time.time()
    mat = np.array(data['distance_matrix'])
    mean = mat.mean()
    std = mat.std()
    merged = list(itertools.chain(*mat))
    skewness = skew(merged)
    noc = len(data['distance_matrix'])
    td = solution.ObjectiveValue() if solution else -1
    dmft = time.time() - dmt_start_time
    '''In this part of the code we build an MST from the distance matrix of the problem, and then
    take the following features from the MST:
    * MST_Mean - Average weight of the MST.
    * MST_Std - Standard deviation of the MST.
    * MST_Skewness - The tendency of the weights in the MST.
    * MST_ft - MST features time, i.e. how long it took us to calculate the MST and all these features.
    '''
    spt_start_time = time.time()
    X = csr_matrix(mat)
    Tcsr = minimum_spanning_tree(X)
    mat_st = np.array(Tcsr.toarray().astype(int))
    mst_mean = mat_st.mean()
    mst_std = mat_st.std()
    merged_st = list(itertools.chain(*mat_st))
    mst_skewness = skew(merged_st)
    mst_ft = time.time() - spt_start_time
    '''In this part of the code we calculate features from the MST that relate to the
    degrees and the depth of the paths in it.
    * D_Mean - Average degree of the MST.
    * D_Std - Standard deviation of the MST degrees.
    * D_Skewness - The tendency of the degrees in the MST.
    * DFT_Mean - The average weight of the deepest path in the MST.
    * DFT_Std - Standard deviation of the deepest path in the MST.
    * DFT_Max - The heaviest arc on the longest path in the MST.
    * DDFT_ft - Degree & DFT features time, i.e. how long it took us to calculate all these features.
    '''
    dstt_start_time = time.time()
    g = Graph.Weighted_Adjacency(mat_st.tolist())
    d_mean = igraph.statistics.mean(g.degree())
    d_std = igraph.statistics.sd(g.degree())
    d_skewness = skew(g.degree())
    d_t = depth_first_tree(X,directed=False)
    mat_dt = np.array(d_t.toarray().astype(int))
    dft_mean = mat_dt.mean()
    dft_std = mat_dt.std()
    dft_max = np.amax(mat_dt)
    ddft_ft = time.time() - dstt_start_time
    # In this map we hold all the features and their results.
    features_map = {'Mean': mean,'Std': std,'Skewness': skewness,'Noc': noc,'Td': td,'Dmft': dmft,'MST_Mean': mst_mean,'MST_Std': mst_std,'MST_Skewness': mst_skewness,'MST_ft': mst_ft,'D_Mean': d_mean,'D_Std': d_std,'D_Skewness': d_skewness,'DFT_Mean': dft_mean,'DFT_Std': dft_std,'DFT_Max': dft_max,'DDFT_ft': ddft_ft,'Solution_time': solution_time}
    return features_map
# Main
# Create dataFrame.
data_TSP = pd.DataFrame()
# Fill the dataFrame.
for i in range(10000):
    #print(i)
    features_map = main()
    data_TSP = data_TSP.append(features_map,ignore_index=True)
# Show data frame.
data_TSP.head()
data_TSP.to_csv('data_10000.csv')
files.download('data_10000.csv')
The regression models:
# Import the standard libraries of pandas.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set_style('whitegrid')
# 2 - Open the data file from Drive.
from google.colab import files
uploaded = files.upload()
import io
df = pd.read_csv(io.BytesIO(uploaded['data_10000_clean.csv']))
try:
    df.drop(['Unnamed: 0'],axis=1,inplace=True)
except:
    pass
df.head()
from sklearn.model_selection import train_test_split
# Split the data into a training set and a test set (70%, 30%).
features = list(df.drop('Solution_time',axis = 1,inplace = False))
y = df['Solution_time']
X = df[features]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3)
# Import the models with which we predict the solution time,
# and the scoring methods to evaluate these models.
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
!pip3 install xgboost
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
!pip install scikit-plot
import scikitplot as skplt
import matplotlib as mpl
########################################## Several functions for different regression models. ##########################################
class score:
    r2 = 0.0 # This score determines how close the predictions really are to the real data.
    cross_vali_score = 0.0 # How well our algorithm generalizes, i.e. how well it will predict new data.
class Regressor:
    def __init__(self,name):
        self.name = name
        self.score = score()
        self.y_pred = None
        self.reg = None
# Map between the name of a model and the model itself.
models_map = {'Random Forest': RandomForestRegressor(),'Xgboost': XGBRegressor(),'Ridge': Ridge(),'Kneighbors': KNeighborsRegressor(),'Linear Regressor': linear_model.LinearRegression()}
# This function returns a map that maps each model name to its Regressor class.
def get_models():
    result_map = {}
    for key,val in models_map.items():
        result = Regressor(key)
        reg = val
        result.score.cross_vali_score = np.mean(cross_val_score(reg,X_train,y_train,cv=5))
        result.reg = reg.fit(X_train,y_train)
        result.y_pred = reg.predict(X_test)
        result.score.r2 = r2_score(y_test,result.y_pred)
        result_map[key] = result
    return result_map
# This function prints, for the relevant models, a graph of the features that influenced their decision making.
def print_influence_graph(map):
    for key,val in map.items():
        if key == 'Random Forest' or key == 'Xgboost':
            # The parameters that most influenced the decision.
            feature_imp = pd.Series(val.reg.feature_importances_,index=features).sort_values(ascending=False)
            sns.barplot(x=feature_imp,y=feature_imp.index)
            # Add labels to the graph.
            plt.xlabel('Feature Importance score')
            plt.ylabel('Features')
            plt.title(val.name.upper() +" - Visualizing Important Features")
            plt.show()
# This function prints, for each model, a graph that shows the real results against the model predictions.
def show_predicted_vs_actual(map):
    for key,val in map.items():
        fig,ax = plt.subplots()
        ax.scatter(y_test,val.y_pred,edgecolors=(0,0,1))
        ax.plot([y_test.min(),y_test.max()],[y_test.min(),y_test.max()],'r--',lw=3)
        ax.set_xlabel('Actual')
        ax.set_ylabel('Predicted')
        ax.title.set_text(val.name.upper() +" - Predicted time vs actual time")
        plt.show()
# This function prints numerical scores for the models.
def print_scores(map):
    for key,val in map.items():
        print(val.name.upper() + ' score: ')
        print('R2score' + ' = ',val.score.r2)
        print('Cross_val_score' + ' = ',val.score.cross_vali_score)
        print('------------------------------------------\n')
# This function prints a graph showing the differences between the scores of the models.
def show_models_differences_graph(map):
    comp_df = pd.DataFrame(columns = ('Method','R2 score','Cross val score'))
    for i in map:
        row = {'Method': i,'R2 score': map[i].score.r2,'Cross val score': map[i].score.cross_vali_score}
        comp_df = comp_df.append(row,ignore_index=True)
    ax = comp_df.plot.bar(x='Method',rot=30,figsize=(12,6))
    ax.set_title('Comparison graph')
#########################################################################################################################################
models = get_models()
print_influence_graph(models)
show_predicted_vs_actual(models)
print_scores(models)
show_models_differences_graph(models)
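As a sanity check on the first point above (whether Noc really contributes nothing), here is a rough sketch using scikit-learn's permutation importance on the fitted XGBoost model; it reuses the models map, the features list and the test split defined above, and is only my assumption of how this could be checked:
# Rough sketch, not part of my original notebook: permutation importance measures how much
# the test-set R2 drops when a single feature column (e.g. Noc) is shuffled,
# giving a second opinion on the XGBoost feature-importance plot.
from sklearn.inspection import permutation_importance
perm = permutation_importance(models['Xgboost'].reg,X_test,y_test,n_repeats=10,random_state=0)
for name,imp in sorted(zip(features,perm.importances_mean),key=lambda t: -t[1]):
    print(name,'=',round(imp,4))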
Here are the results I got: