API 调用缓慢的问题

问题描述

我有一个非常大的 df（200k+ 行）OD（始发地-目的地）邮政编码，我想为其计算公共交通行程信息（即，腿数、总持续时间、总步行时间等））。我正在使用 TfL 的 API 来检索特定信息并已注册以获取我在下面的代码中使用的 app_key（这允许每分钟最多 500 次调用）。

下面的代码显示了一个长度为 3 的虚拟列表通过函数传递，并将结果整理生成一个名为 df 的 output（这只是为了说明）。

在函数中，我特别想保留成功调用的输出（即状态 = 200）并且不关心其他（这些是状态 = 500、504、300）并且刚刚为这些。

请注意，journey Planner 无法很好地处理伦敦郊区的邮政编码，并且会返回状态代码 = 300。这似乎是一个未解决的错误。请参阅此处的说明： https://techforum.tfl.gov.uk/t/journey-planner-lat-longs-causing-multiple-choices-redirect-300/298

library(httr)
library(jsonlite)
library(tidyverse)

# create df and turn this into a list,ready for input into the function below

a           <- data.frame(from                    = c("EN3 5JW","SW9 8HE","SE12 9TJ"),to                      = c("N17 9LP","BR7 5EB","NW10 1PH"),date                    = c("20210219","20210219","20210219"),time                    = c("0200","0245","0300"),timeIs                  = c("Departing","Departing","Departing"),journeyPreference       = c("LeastTime","LeastTime","LeastTime"),accessibilityPreference = c("norequirements","norequirements","norequirements"),walkingSpeed            = c("Slow","Slow","Slow"),cyclePreference         = c("None","None","None"),bikeProficiency         = c("Easy","Easy","Easy"))
b           <- split(a,seq(nrow(a)))



api_function <- function(from,to,date,time,timeIs,journeyPreference,accessibilityPreference,walkingSpeed,cyclePreference,bikeProficiency){
  
  tflpath <- "https://api.tfl.gov.uk/journey/journeyResults/{from}/to/{to}?date=[date]&time=[time]&timeIs=[timeIs]&journeyPreference=[journeyPreference]&mode=tube%2Cbus%2Coverground&accessibilityPreference=[accessibilityPreference]&walkingSpeed=[walkingSpeed]&cyclePreference=[cyclePreference]&bikeProficiency=[bikeProficiency]"
  
  request      <- GET(url      = tflpath,add_headers(app_key= "bdbaxxxxxxxxxxxxxxxxxxxxf5a9"),encoding = "UTF-8",query    = list(from                     =from,to                       =to,date                     =date,time                     =time,timeIs                   =timeIs,journeyPreference        =journeyPreference,accessibilityPreference  =accessibilityPreference,walkingSpeed             =walkingSpeed,cyclePreference          =cyclePreference,bikeProficiency          =bikeProficiency))
  
  #extract the status message - we want to skip over those without status = 300
  status_result <- http_status(request)$message
  
  if(status_result=="Success: (200) OK"){
   
    json_content <- fromJSON(content(request,"text",encoding = "UTF-8"),flatten = TRUE) 
    # get journey leg information
    x            <- flatten(json_content$journeys$legs)
    # get journey duration and mode information
    df1          <- data.frame(duration = x$duration,mode.id  = x$mode.id)
    # get origin and destination info (i.e. postcodes)
    od           <- as.data.frame(json_content$journeyVector)[,2:3]  
    df2 <- df1%>%
      group_by(mode.id)%>%
      summarise(legs = n(),total_dur = sum(duration))%>%
      mutate(tube_legs       = case_when(mode.id== 'tube'~ as.numeric(legs),mode.id== 'walking'~ 0,mode.id== 'bus'~ 0,mode.id== 'overground'~ 0),bus_legs        = case_when(mode.id== 'tube'~ 0,mode.id== 'overground'~ 0,mode.id== 'bus'~ as.numeric(legs)),walk_legs       = case_when(mode.id== 'tube'~ 0,mode.id== 'walking'~ as.numeric(legs),overground_legs = case_when(mode.id== 'tube'~ 0,mode.id== 'overground'~ as.numeric(legs)),walk_dur        = case_when(mode.id== 'walking' ~ as.numeric(total_dur),TRUE ~ 0))%>%
      ungroup()%>%
      select(-mode.id)%>%
      summarise(total_legs      = sum(legs),total_dur       = sum(total_dur),tube_legs       = sum(tube_legs),bus_legs        = sum(bus_legs),overground_legs = sum(overground_legs),walk_legs       = sum(walk_legs),walk_dur        = sum(walk_dur))%>%
      cbind(od)%>%
      cbind(min(x$departureTime),max(x$arrivalTime))%>%
      rename(dep_time = "min(x$departureTime)",arrival_time = "max(x$arrivalTime)")%>%
      select(from,dep_time,arrival_time,tube_legs,bus_legs,walk_legs,overground_legs,total_legs,walk_dur,total_dur)
    return(df2)
    
  }
  else{
    return(data.frame(from            = from,to              = to,dep_time        = "",arrival_time    = "",tube_legs       = "",bus_legs        = "",walk_legs       = "",overground_legs = "",total_legs      = "",walk_dur        = "",total_dur       = "")
    )
  }
}
       
test        <- vector('list',300000)
for (i in 1:length(b)){
  test[[i]] <- data.frame(do.call(api_function,b[[i]]))
}
output      <- do.call(rbind.data.frame,test)

output 应如下所示：

我遇到的主要问题是：

传递长度 >200k 的列表需要非常长的时间（每个成功的 GET() 最多需要 3 秒，不成功的 GET() 调用，即状态代码 500、504、300 最多需要 55秒）。

我在长度为 200 的虚拟列表上对此进行了测试，花了 12 分钟！

有什么办法可以加快速度吗？

解决方法

暂无找到可以解决该程序问题的有效方法，小编努力寻找整理中！

如果你已经找到好的解决方法，欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@）

get list list performance performance performance r r