大型数据集-无法轻松处理

问题描述

我正在尝试找到最接近特定纬度和经度的站点。

# import requests
import json



# import matplotlib.pyplot as plt
# import seaborn as sns

# from pandasdmx import Request
# from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import pandas as pd
import numpy as np
import geopandas as gpd


import geopy.distance


import time

import csv

# import sys,os
# from sqlalchemy import create_engine

# import pymssql
# import sqlalchemy as db

# from functools import reduce

# import folium # map rendering library
# from sklearn.neighbors import NearestNeighbors




property_data_set = pd.read_csv('prop_data_set.csv')

shape_file_trains = "./shapefile/ptv_metro_train_station.shp"

trains_shape = gpd.read_file(shape_file_trains)


mwarp=property_data_set


# cartesian product so we get all combinations
dfdist = (mwarp.assign(foo=1).merge(trains_shape.assign(foo=1),on="foo")
    # calc distance in km between each suburb and each train station
     .assign(km=lambda dfa: dfa.apply(lambda r: 
                                      geopy.distance.geodesic(
                                          (r["LATITUDE"],r["LONGITUDE"]),(r["lat"],r["lon"])).km,axis=1))
    # reduce number of columns to make it more digestable
     .loc[:,["postcode","address_street_full","STOP_ID","STOP_NAME","km"]]
    # sort so shortest distance station from a suburb is first
     .sort_values(["postcode","suburb","km"])
    # good practice
     .reset_index(drop=True)
)
# finally pick out stations nearest to suburb
# this can easily be joined back to source data frames as postcode and STOP_ID have been maintained
dfnearest = dfdist.groupby(["postcode","suburb"])\
    .agg({"STOP_ID":"first","STOP_NAME":"first","km":"first"}).reset_index()

# print(dfnearest.to_string(index=False))

dfnearest.to_csv("distances_station")
print(dfnearest)

这是我当前正在使用的代码。我的计算机正在努力进行此分析，因为有11,000行数据。我想批量处理它，但不确定100％采取哪种最佳方法。任何人都做过类似的事情并且了解最佳途径吗？谢谢

解决方法

暂无找到可以解决该程序问题的有效方法，小编努力寻找整理中！

如果你已经找到好的解决方法，欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@）

bigdata pandas pandas python