问题描述
我有一个表,其中包含日期时间列(Date_time)和分组条件列NEL_Hotspots。
我想按以下规则对表格进行汇总:
先按NEL_Hotspots分组,然后把同一天(24小时)内、且Wind_direc相差在±10以内的所有观测值归为一组。
这是较大表的一小部分:
# Sample input data as printed by dput().
# NOTE(review): this literal was damaged in extraction and will not parse as-is:
# the column vectors have unequal lengths (e.g. Serial_number has 12 values,
# Wind_direc 9, NEL_Hotspots 3, while row.names declares 13 rows) and the
# `spec` part has unbalanced parentheses. Obtain the original data before running.
structure(list(Serial_number = c(10,8,9,20,21,23,3,5,7,11,13,24),Date_time = c("3/31/05 1:57","3/31/05 4:12","3/31/05 18:12","4/1/05 2:12","4/1/05 3:12","4/3/05 16:12","3/28/05 9:57","3/30/05 13:42","3/31/05 1:57","4/10/05 10:57","4/10/05 18:57","4/10/05 20:13","4/10/05 21:30"),Wind_direc = c(50,60,70,140,50,270,300,310,290),NEL_Hotspots = c(0,1,1),Dust_Intens = c("weak","weak","medium","high"),Area_km2 = c(290,241,225,240,340,320,176,143,211,72,171,167,121)),.Names = c("Serial_number","Date_time","Wind_direc","NEL_Hotspots","Dust_Intens","Area_km2"
),class = c("spec_tbl_df","tbl_df","tbl","data.frame"),row.names = c(NA,-13L),spec = structure(list(cols = structure(list(Serial_number = structure(list(),class = c("collector_double","collector")),Date_time = structure(list(),class = c("collector_character",Wind_direc = structure(list(),NEL_Hotspots = structure(list(),Dust_Intens = structure(list(),Area_km2 = structure(list(),"collector"))),"Area_km2")),default = structure(list(),class = c("collector_guess",skip = 1),.Names = c("cols","default","skip"
),class = "col_spec"))
加载数据后,我使用lubridate,通过df <- df %>% mutate(full_date = ymd_hms(Date_time))创建了full_date列。
预期输出为:
# Expected summary table as printed by dput().
# NOTE(review): this literal was damaged in extraction and will not parse as-is:
# the column vectors have unequal lengths (2, 3 and 9 values appear side by
# side), `wind_direc_min = c(50,` is never closed, and the `spec` part has
# unbalanced parentheses. Treat it as an illustration of the desired columns only.
structure(list(`First Date_time` = c("3/31/05 1:57","4/10/05 18:57"),`Last Date_time` = c("3/31/05 4:12","4/10/05 21:30"
),Wind_direc_avg = c(55,300),wind_direc_min = c(50,wind_direc_max = c(60,310),Dust_Intens = c("weak,weak","weak,medium","medium,medium,high"),Area_km2_avg = c(265.5,290,153),Area_km2_stdv = c(34.64,70.71,27.78),events_count = c(2,2,3),serial_numbers = c("10,8","9","20,21","23","3","5","7","11","13,24")),.Names = c("First Date_time","Last Date_time","Wind_direc_avg","wind_direc_min","wind_direc_max","Area_km2_avg","Area_km2_stdv","events_count","serial_numbers"),-9L),spec = structure(list(
cols = structure(list(`First Date_time` = structure(list(),`Last Date_time` = structure(list(),Wind_direc_avg = structure(list(),wind_direc_min = structure(list(),wind_direc_max = structure(list(),Area_km2_avg = structure(list(),Area_km2_stdv = structure(list(),events_count = structure(list(),serial_numbers = structure(list(),"serial_numbers")),class = "col_spec"))
我将不胜感激!
解决方法
尝试根据您的条件创建分组。在以下情况下创建一个新组:
- 日期发生变化
- Wind_direc的值每变化+10
对于每个组,请在summarise中计算您需要的所有统计量。
library(dplyr)
# Collapse individual observations in `df` into events and summarise each one.
#
# An event is defined by two nested grouping keys:
#   1. `date` - the calendar day of the observation;
#   2. `val`  - a 10-degree step counter for Wind_direc, measured from the
#               first reading of that day and shifted by one row with lag(),
#               so a row joins the bucket computed for the previous row.
#
# NOTE(review): the question asks to group by NEL_Hotspots first; this
# pipeline sums NEL_Hotspots per event instead of grouping by it - confirm
# which behaviour is wanted.
# NOTE(review): first()/last() follow row order, so this presumably expects
# the rows of each day to be in chronological order - verify against the data.
df %>%
  mutate(
    # Sample timestamps look like "3/31/05 1:57", i.e. month/day/year hour:minute.
    Date_time = lubridate::mdy_hm(Date_time),
    date = as.Date(Date_time)
  ) %>%
  group_by(date) %>%
  group_by(
    val = lag(ceiling((Wind_direc - first(Wind_direc)) / 10), default = 0),
    .add = TRUE
  ) %>%
  summarise(
    first_date_time = first(Date_time),
    last_date_time = last(Date_time),
    Wind_direc_avg = mean(Wind_direc),
    Wind_direc_min = min(Wind_direc),
    Wind_direc_max = max(Wind_direc),
    NEL_Hotspots = sum(NEL_Hotspots),
    Dust_Intens = toString(Dust_Intens),
    Area_km2_avg = mean(Area_km2),
    # sd() returns NA for events that contain a single observation.
    Area_km2_stdv = sd(Area_km2),
    events_count = n(),
    serial_numbers = toString(Serial_number),
    # Return an ungrouped tibble so later verbs do not silently run per group.
    .groups = "drop"
  )