Regression to obtain the empirical hardness of the TSP

Problem description

I am trying to implement what was done in this paper: http://robotics.stanford.edu/users/shoham/www%20papers/Empirical%20Hardness.pdf with regard to the Travelling Salesman Problem.

I am talking about the classic version of the problem: cities are represented by vertices, distances are the edge weights, and the graph is complete.
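For concreteness, a toy instance of that setup could look like the following (a hypothetical 4-city example, not taken from the code below):

import numpy as np

# A complete, symmetric TSP instance: entry [i][j] is the weight of the
# edge between city i and city j, and the diagonal is 0 (no self-loops).
toy_distance_matrix = np.array([
    [ 0,  2,  9, 10],
    [ 2,  0,  6,  4],
    [ 9,  6,  0,  8],
    [10,  4,  8,  0],
])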

I am trying to get my model to predict the runtime from every feature I could think of, for example: the number of cities, the standard deviation of the weights, the mean and mode of the weights, and the total tour distance (even though the last one does not really qualify as a feature, since it is not cheap to compute). But for some reason my model performs poorly on the higher weight ranges (1-100, 1-1000).
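As a self-contained illustration of those instance features, a minimal sketch could look like this (the data creator below computes the same statistics inline while generating instances):

import numpy as np

def instance_features(dist_matrix):
    """Features of one TSP instance: size plus weight statistics (a sketch)."""
    mat = np.asarray(dist_matrix)
    n = mat.shape[0]                       # number of cities
    weights = mat[~np.eye(n, dtype=bool)]  # off-diagonal edge weights only
    vals, counts = np.unique(weights, return_counts=True)
    return {
        'num_of_places': n,
        'mean': weights.mean(),
        'standard_deviation': weights.std(),
        'mode': vals[np.argmax(counts)],   # most frequent edge weight
    }

# e.g. instance_features(toy_distance_matrix)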

Any help or suggestions would be much appreciated. Here is my code (I run it on Google Colab). Data creator:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set_style('whitegrid')
import time
import random
from random import randrange
from google.colab import files

!pip3 install ortools

"""Simple travelling salesman problem between cities."""
from ortools.constraint_solver import routing_enums_pb2
from ortools.constraint_solver import pywrapcp
from scipy import stats
import itertools
import math

def create_data_model():
    """Stores the data for the problem."""
    data = {}
    # dim will be the number of vertices/cities in the Travelling Salesman Problem
    #dim = int(round(np.random.normal(10,100)))  # Fill according to the task, normal distribution
    dim = np.random.randint(160,320) # Fill according to the task, uniform distribution

    square_matrice = [[0 for row in range(dim)] for col in range(dim)]
    for i in range(dim):
        for j in range(dim):
            if i == j:
                square_matrice[i][j] = 0
            else:
                #square_matrice[i][j] = square_matrice[j][i] = int(round(np.random.normal(10,50)))  # Fill according to the task, normal distribution
                square_matrice[i][j] = square_matrice[j][i] = np.random.randint(1,1000) # Fill according to the task, uniform distribution

    data['distance_matrix'] = square_matrice # yapf: disable
    data['num_vehicles'] = 1
    data['depot'] = 0
    mat = np.array(square_matrice)
    # Take the statistics over the off-diagonal entries only; with the zero
    # diagonal included, 0 would always win the mode at these dimensions.
    off_diag = mat[~np.eye(dim, dtype=bool)]
    mean = off_diag.mean()
    standard_deviation = off_diag.std()
    vals, counts = np.unique(off_diag, return_counts=True)
    mode = vals[np.argmax(counts)]

    return data,mean,standard_deviation,mode

def main():
    """Entry point of the program."""
    start_time = time.time()
    # Instantiate the data problem.
    data,mean,standard_deviation,mode = create_data_model()

    # Create the routing index manager.
    manager = pywrapcp.RoutingIndexManager(len(data['distance_matrix']),data['num_vehicles'],data['depot'])
    # Create Routing Model.
    routing = pywrapcp.RoutingModel(manager)

    def distance_callback(from_index,to_index):
        """Returns the distance between the two nodes."""
        # Convert from routing variable Index to distance matrix NodeIndex.
        from_node = manager.IndexToNode(from_index)
        to_node = manager.IndexToNode(to_index)
        return data['distance_matrix'][from_node][to_node]

    transit_callback_index = routing.RegisterTransitCallback(distance_callback)

    # Define cost of each arc.
    routing.SetArcCostEvaluatorOfAllVehicles(transit_callback_index)

    # Setting first solution heuristic.
    search_parameters = pywrapcp.DefaultRoutingSearchParameters()
    search_parameters.first_solution_strategy = (
        routing_enums_pb2.FirstSolutionStrategy.PATH_CHEAPEST_ARC)

    # Solve the problem.
    solution = routing.SolveWithParameters(search_parameters)

    # Collect the features and the runtime it took for this instance.
    calc_time = time.time() - start_time
    number_of_places = len(data['distance_matrix'])
    total_distance = solution.ObjectiveValue() if solution else -1
    return calc_time,number_of_places,total_distance,mean,standard_deviation,mode

# Main
# Build the dataFrame: one row of features + runtime per generated instance.
rows = []
for i in range(5000):
    calc_time,number_of_places,total_distance,mean,standard_deviation,mode = main()
    rows.append({'num_of_places': number_of_places,'total_distance': total_distance,'mean': mean,'standard_deviation': standard_deviation,'mode': mode,'calc_time': calc_time})
    #print(calc_time,mode)

data_TSP = pd.DataFrame(rows,columns = ('num_of_places','total_distance','mean','standard_deviation','mode','calc_time'))

data_TSP

data_TSP.to_csv('data_TSP_80V_160V_1m_1000m.csv')
files.download('data_TSP_80V_160V_1m_1000m.csv')

Model prediction and presentation of the results:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set_style('whitegrid')

# Needed for opening the data file from Drive.
from google.colab import files
uploaded = files.upload()
import io

df = pd.read_csv(io.BytesIO(uploaded['cleaned_data (1).csv']))
try:
    df.drop('Unnamed: 0',axis=1,inplace=True)
except:
    pass
df.head()

from sklearn.model_selection import train_test_split

# Split the data to training set and test set (70%,30%)
features = list(df.drop('calc_time',axis = 1,inplace = False))
y = df["calc_time"]
X = df[features]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3)

from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model

from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
!pip install scikit-plot
import scikitplot as skplt
import matplotlib as mpl

!pip3 install xgboost
from xgboost import XGBRegressor


########################################## Several functions for different regression  models. ##########################################

class score:
    r2 = 0.0  # How close the predictions are to the actual data.
    cross_vali_score = 0.0  # How well the model is likely to generalize to new data.


class ParamHolder:
    best_p = {}  # Best hyper-parameters found by the (commented-out) grid searches.


class Regressor:

    def __init__(self,name):
        self.name = name
        self.score = score()
        self.ph = ParamHolder()
        self.y_pred = None
        self.clf = None


# This map maps each model to all its results: its predictions, its scores and the parameters we used for it.
models_map = {}

''' This function gets a model name as a string and returns the model after
 selecting the best parameters for it, fitting it on the training set,
 evaluating its predictions against the test set and, of course, keeping all its scores.'''


def get_model(criterion):
    result = Regressor(criterion)
    if criterion == 'friedman_mse' or criterion == 'poisson':
        # result.ph = best_tree_params(criterion)
        #  clf = DecisionTreeRegressor(criterion=criterion,**result.ph.best_p)
        clf = DecisionTreeRegressor(criterion=criterion,**result.ph.best_p)
    elif criterion == 'random forest':
        # result.ph = best_forest_params()
        # clf = RandomForestRegressor(**result.ph.best_p)
        clf = RandomForestRegressor(n_estimators=100)
    elif criterion == 'xgboost':
        clf = XGBRegressor()
        # result.ph = best_xgboost_params()
        # clf = XGBRegressor(**result.ph.best_p)
    elif criterion == 'ridge':
        clf = Ridge(alpha=1,fit_intercept=False,copy_X=True,max_iter=10000,tol=0.001,solver='lsqr',random_state=4)  # 'normalize' dropped: removed from recent scikit-learn, and False was the default anyway
        # result.ph = best_etc_params()
        # clf = Ridge(**result.ph.best_p)
    elif criterion == 'kneighbors':
        # result.ph = best_knn_params()
        # clf = KNeighborsRegressor(**result.ph.best_p)
        clf = KNeighborsRegressor()
    elif criterion == 'PoissonRegressor':
        clf = linear_model.PoissonRegressor()  # was LinearRegression(), which did not match the label

    result.score.cross_vali_score = np.mean(cross_val_score(clf,X_train,y_train,cv=5))
    result.clf = clf.fit(X_train,y_train)
    result.y_pred = clf.predict(X_test)
    result.score.r2 = r2_score(y_test,result.y_pred)
    return result


def print_influence_graph(regressor):
    # The parameters that most influenced the decision. * Part D on this algo
    feature_imp = pd.Series(regressor.clf.feature_importances_,index=features).sort_values(ascending=False)
    sns.barplot(x=feature_imp,y=feature_imp.index)
    # Add labels to your graph
    plt.xlabel('Feature Importance score')
    plt.ylabel('Features')
    plt.title(regressor.name.upper() + " - Visualizing Important Features")
    plt.show()


def show_predicted_vs_actual(regressor):
    fig,ax = plt.subplots()
    ax.scatter(y_test,regressor.y_pred,edgecolors=(0,0,1))
    ax.plot([y_test.min(),y_test.max()],[y_test.min(),y_test.max()],'r--',lw=3)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    plt.show()


def print_scores(regressor):
    print('R2score for ' + regressor.name.upper() + ' = ',regressor.score.r2)
    print('Cross_val_score for ' + regressor.name.upper() + ' = ',regressor.score.cross_vali_score)

#########################################################################################################################################

models_map['random forest'] = reg_rf = get_model('random forest')

# The parameters that most influenced the decision.
print_influence_graph(reg_rf)

show_predicted_vs_actual(reg_rf)

print_scores(reg_rf)

models_map['ridge'] = reg_ridge = get_model('ridge')

show_predicted_vs_actual(reg_ridge)

print_scores(reg_ridge)


models_map['xgboost'] = reg_xgb = get_model('xgboost')

print_influence_graph(reg_xgb)

show_predicted_vs_actual(reg_xgb)

print_scores(reg_xgb)

models_map['PoissonRegressor'] = reg_poisson = get_model('PoissonRegressor')

show_predicted_vs_actual(reg_poisson)

print_scores(reg_poisson)

models_map['kneighbors'] = clf_kn = get_model('kneighbors')

show_predicted_vs_actual(clf_kn)

print_scores(clf_kn)

comp_df = pd.DataFrame(
    [{'Method': name,'R2 score': m.score.r2,'Cross val score': m.score.cross_vali_score}
     for name,m in models_map.items()],
    columns = ('Method','R2 score','Cross val score'))

comp_df

ax = comp_df.plot.bar(x='Method',rot=30,figsize=(12,6))
ax.set_title('Comparison graph')

Thank you very much!
