问题描述
我正在进行中断时间序列回归分析,以帮助查看特定事件后值是否存在显着的非零变化。这是两个模拟数据帧;一个具有日期和值,另一个具有事件名称和该事件的对应日期:-
#dataset 1
eventDate<-structure(c(18262,18263,18264,18265,18266,18267,18268,18269,18270,18271,18272,18273,18274,18275,18276,18277,18278,18279,18280,18281,18282,18283,18284,18285,18286,18287,18288,18289,18290,18291,18292,18293,18294,18295,18296,18297,18298,18299,18300,18301,18302,18303,18304,18305,18306,18307,18308,18309,18310,18311,18312,18313,18314,18315,18316,18317,18318,18319,18320,18321,18322,18323,18324,18325,18326,18327,18328,18329,18330,18331,18332,18333,18334,18335,18336,18337,18338,18339,18340,18341,18342,18343,18344,18345,18346,18347,18348,18349,18350,18351,18352,18353,18354,18355,18356,18357,18358,18359,18360,18361,18362,18363,18364,18365,18366,18367,18368,18369,18370,18371,18372,18373,18374,18375,18376,18377,18378,18379,18380,18381,18382,18383,18384,18385,18386,18387,18388,18389,18390,18391,18392,18393,18394,18395,18396,18397,18398,18399,18400,18401,18402,18403,18404,18405,18406,18407,18408,18409,18410,18411,18412,18413,18414,18415,18416,18417,18418,18419,18420,18421,18422,18423,18424,18425,18426,18427,18428,18429,18430,18431,18432,18433,18434,18435,18436,18437,18438,18439,18440,18441,18442,18443,18444,18445,18446,18447,18448,18449,18450,18451,18452,18453,18454,18455,18456,18457,18458,18459,18460,18461,18462,18463,18464,18465,18466,18467,18468,18469,18470,18471,18472,18473,18474,18475,18476,18477,18478,18479,18480,18481,18482,18483,18484,18485,18486,18487,18488,18489,18490,18491,18492,18493,18494,18495,18496,18497,18498,18499,18500,18501,18502,18503,18504,18505,18506,18507,18508,18509,18510,18511,18512,18513,18514,18515,18516,18517,18518,18519,18520,18521,18522,18523,18524,18525,18526,18527,18528,18529,18530,18531,18532,18533,18534,18535,18536,18537,18538,18539,18540,18541,18542,18543,18544,18545,18546,18547,18548,18549,18550,18551,18552,18553,18554,18555,18556,18557,18558,18559,18560,18561,18562,18563,18564,18565,18566,18567,18568,18569,18570,18571,18572,18573,18574,18575,18576,18577,18578,18579,18580,18581,18582,18583,18584,18585,18586,18587,18588,18589,18590,18591,18592,18593,18594,18595,18596,18597,18598,18599,18600,18601,18602,18603,18604,18605,18606,18607,18608,18609,18610,18611,18612,18613,18614,18615,18616,18617,18618,18619,18620,18621,18622,18623,18624,18625,18626,18627),class = "Date")
Count<-c(46L,58L,46L,60L,42L,56L,44L,48L,43L,50L,45L,55L,57L,47L,51L,59L,53L,52L,49L,122L,100L,91L,82L,54L,41L,40L,155L,123L,98L,90L,84L,71L,57L)
data<-data.frame(eventDate,Count)
#dataset 2
Event<-c("event_a","event_b","event_c","event_d","event_e","event_f","event_g")
Event_Date<-structure(c(18289,18547),class = "Date")
events_dates<-data.frame(Event,Event_Date)
以下是我正在做的两个示例。一个例子有一个显着的结果,一个例子有一个不显着的结果。
library(dplyr)
library(ggplot2)
#significant result
event_date<-as.Date("2020-09-01")
before_period<-as.Date(event_date)-21
after_period<-as.Date(event_date)+21
data_filtered<-data%>%
filter(eventDate>=as.Date(before_period) & eventDate<=as.Date(after_period))
data_filtered<-data_filtered%>%
mutate(DayNumber=row_number())
data_filtered$intv_trend <- cumsum(data_filtered$eventDate >= as.Date(event_date))
data_filtered$Post_event<-ifelse(data_filtered$eventDate<event_date,1)
data_filtered <- data_filtered %>%
mutate(lag_count = lag(Count))
data_filtered
fit <- glm(Count ~ DayNumber+ Post_event + intv_trend+log(lag_count),family = "poisson",data = data_filtered)
summary(fit)
data_filtered$fit <- exp(c(NA,predict(fit)))
data_filtered$fit2 = c(NA,predict(fit,type="response"))
data_filtered$Group<-ifelse(data_filtered$Post_event==0,"Pre-event","Post-event")
data_filtered$Group<-factor(data_filtered$Group,levels = c("Pre-event","Post-event"))
ggplot(data_filtered,aes(x=DayNumber,y = Count,colour=Group)) +
geom_line()+geom_smooth(method="lm",se=F,aes(colour=Group)) +ggtitle("Count before and after event")+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
ggplot(data_filtered,aes(x = DayNumber,y = fit2)) +
geom_line() +
geom_smooth(method="lm",aes(colour=Group)) +
theme_bw() +
labs(colour="")+ggtitle("Count (Fitted); Method=lm")+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
#non-significant result
event_date<-as.Date("2020-01-28")
before_period<-as.Date(event_date)-21
after_period<-as.Date(event_date)+21
data_filtered<-data%>%
filter(eventDate>=as.Date(before_period) & eventDate<=as.Date(after_period))
data_filtered<-data_filtered%>%
mutate(DayNumber=row_number())
data_filtered$intv_trend <- cumsum(data_filtered$eventDate >= as.Date(event_date))
data_filtered$Post_event<-ifelse(data_filtered$eventDate<event_date,"Post-event"))
ggplot(data_filtered,aes(colour=Group)) +ggtitle("Count before and after event")+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
每个 event_date
变量都来自 event_dates
数据框。
我有一个很长的日期列表,我想用这些日期进行分析,所以我需要一个函数来帮助我有效地执行此操作。我希望它返回回归拟合摘要和每个事件的两个图。这是我迄今为止的尝试(如您所知,不是一个好的尝试):-
#data1 represents data,data2 represents events_dates,n_days is number of days before and after event_date that I want to filter from data
TS_Intervention_Func<-function(data1,data2,n_days){
myresultslist<-list()
for (i in data2$Event_Date) {
event_date<-as.Date(i)
before_period<-as.Date(event_date)-n_days
after_period<-as.Date(event_date)+n_days
data_filtered<-data1%>%
filter(eventDate>=as.Date(before_period) & eventDate<=as.Date(after_period))
data_filtered<-data_filtered%>%
mutate(DayNumber=row_number())
data_filtered$intv_trend <- cumsum(data_filtered$eventDate >= as.Date(event_date))
data_filtered$Post_event<-ifelse(data_filtered$eventDate<event_date,1)
data_filtered <- data_filtered %>%
mutate(lag_count = lag(Count))
fit <- glm(Count ~ DayNumber+ Post_event + intv_trend+log(lag_count),data = data_filtered)
results<-summary(fit)
data_filtered$fit <- exp(c(NA,predict(fit)))
data_filtered$fit2 = c(NA,type="response"))
data_filtered$Group<-ifelse(data_filtered$Post_event==0,"Post-event")
data_filtered$Group<-factor(data_filtered$Group,"Post-event"))
plot_1<-ggplot(data_filtered,colour=Group)) +
geom_line()+geom_smooth(method="lm",aes(colour=Group)) +ggtitle("Count before and after event",paste0(event_date))+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
plot_2<-ggplot(data_filtered,y = fit2)) +
geom_line() +
geom_smooth(method="lm",aes(colour=Group)) +
theme_bw() +
labs(colour="")+ggtitle("Count (Fitted); Method=lm",linetype="dotted")+labs(caption = "Dotted line represents time of event")
myresultslist[[i]] <- do.call(results,plot1,plot_2)
}
return(myresultslist)
}
TS_Intervention_Func(data,events_dates,21)
#Error in as.Date.numeric(i) : 'origin' must be supplied
简而言之,我想让函数做三件事:-
- 从
event_dates
数据框中获取每个日期并在data
中的该日期运行分析迭代 - 将
fit
摘要和两个对应的图存储到该事件日期并将其保存在列表中(如果事件名称可以与它一起保存以便于查找,那就更好了) - 这是可取的;如果列表可以分成两部分,一个部分保存了重要的结果,另一部分保存了不重要的结果。
一个很大的问题,但一如既往地感谢任何帮助:)
解决方法
这不一定是最有效的方法,但我想让它保持简单易懂。
我稍微清理了您的代码并将其放入一个函数中。首先,我编写了一个处理一个事件日期的函数:
library(dplyr)
library(ggplot2)
reg_fun <- function(event_date,data) {
before_period <- as.Date(event_date) - 21
after_period <- as.Date(event_date) + 21
data_filtered <- data %>%
filter(eventDate >= as.Date(before_period) &
eventDate <= as.Date(after_period)) %>%
mutate(DayNumber = row_number()) %>%
mutate(intv_trend = cumsum(eventDate >= event_date)) %>%
mutate(Post_event = as.integer(!eventDate < event_date)) %>%
mutate(lag_count = lag(Count))
fit <- glm(Count ~ DayNumber+ Post_event + intv_trend+log(lag_count),family = "poisson",data = data_filtered)
fit_summary <- summary(fit)
data_filtered$fit <- exp(c(NA,predict(fit)))
data_filtered$fit2 = c(NA,predict(fit,type = "response"))
data_filtered$Group <-
ifelse(data_filtered$Post_event == 0,"Pre-event","Post-event")
data_filtered$Group <-
factor(data_filtered$Group,levels = c("Pre-event","Post-event"))
plot1 <- ggplot(data_filtered,aes(x = DayNumber,y = Count,colour = Group)) +
geom_line() + geom_smooth(method = "lm",se = F,aes(colour = Group)) +
ggtitle("Count before and after event") +
geom_vline(xintercept = 22,linetype = "dotted") + labs(caption = "Dotted line represents time of event")
plot2 <- ggplot(data_filtered,y = fit2)) +
geom_line() +
geom_smooth(method = "lm",aes(colour = Group)) +
theme_bw() +
labs(colour = "") + ggtitle("Count (Fitted); Method=lm") +
geom_vline(xintercept = 22,linetype = "dotted") + labs(caption = "Dotted line represents time of event")
output <- list(
event_date = event_date,fit = fit,summary = fit_summary,plot1 = plot1,plot2 = plot2,signif = sum(coef(fit_summary)[,4] < 0.05) > 1 # see if any is significant (p-value for intercept is always < 0.05 so we want more than one significant value)
)
return(output)
}
现在我们可以用一个日期来测试这一点。正如你在上面看到的,输出是一个列表,我认为在这种情况下最合适。
test_res <- reg_fun(event_date = as.Date("2020-09-01"),data = data)
# print summary
test_res$summary
#>
#> Call:
#> glm(formula = Count ~ DayNumber + Post_event + intv_trend + log(lag_count),#> family = "poisson",data = data_filtered)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.9155 -0.8267 -0.1498 0.8527 4.9107
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 3.649360 0.353805 10.315 < 2e-16 ***
#> DayNumber 0.004957 0.005517 0.899 0.369
#> Post_event 0.714854 0.098185 7.281 3.32e-13 ***
#> intv_trend -0.051807 0.008034 -6.448 1.13e-10 ***
#> log(lag_count) 0.050323 0.089530 0.562 0.574
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for poisson family taken to be 1)
#>
#> Null deviance: 278.729 on 41 degrees of freedom
#> Residual deviance: 95.044 on 37 degrees of freedom
#> (1 observation deleted due to missingness)
#> AIC: 350.86
#>
#> Number of Fisher Scoring iterations: 4
# plot 1
test_res$plot1
#> `geom_smooth()` using formula 'y ~ x'
# plot 2
test_res$plot2
#> `geom_smooth()` using formula 'y ~ x'
#> Warning: Removed 1 rows containing non-finite values (stat_smooth).
#> Warning: Removed 1 row(s) containing missing values (geom_path).
我不显示情节,但你懂的。
现在我将其包装在一个可以同时获取多个日期的函数中。理论上我们可以在同一个函数中做到这一点。
reg_fun_mult <- function(event_dates,dat) {
output <- lapply(event_dates,reg_fun,dat)
names(output) <- event_dates # give the list elements suitable names
return(output)
}
test_res_mult <- reg_fun_mult(eventDate,data)
# check if any variables were significant
sign <- sapply(test_res_mult,function(x) x$signif)
# look at the first ten results to see which ones have significant values
sign[1:10]
#> 2020-01-01 2020-01-02 2020-01-03 2020-01-04 2020-01-05 2020-01-06 2020-01-07
#> FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> 2020-01-08 2020-01-09 2020-01-10
#> FALSE FALSE FALSE
# keep only significant results
test_res_mult_sign <- test_res_mult[sign]
您可以再次查看单个摘要和图表,如下所示:
# summary
test_res_mult$`2020-01-01`$summary
#>
#> Call:
#> glm(formula = Count ~ DayNumber + Post_event + intv_trend + log(lag_count),data = data_filtered)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.3085 -0.8371 -0.2015 0.9650 1.4188
#>
#> Coefficients: (2 not defined because of singularities)
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 4.3952688 0.9248944 4.752 2.01e-06 ***
#> DayNumber 0.0001341 0.0050776 0.026 0.979
#> Post_event NA NA NA NA
#> intv_trend NA NA NA NA
#> log(lag_count) -0.1213342 0.2358350 -0.514 0.607
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for poisson family taken to be 1)
#>
#> Null deviance: 18.972 on 20 degrees of freedom
#> Residual deviance: 18.705 on 18 degrees of freedom
#> (1 observation deleted due to missingness)
#> AIC: 145.57
#>
#> Number of Fisher Scoring iterations: 4
# plot 1
test_res_mult$`2020-01-01`$plot1
#> `geom_smooth()` using formula 'y ~ x'
# plot 2
test_res_mult$`2020-01-01`$plot2
#> `geom_smooth()` using formula 'y ~ x'
#> Warning: Removed 1 rows containing non-finite values (stat_smooth).
#> Warning: Removed 1 row(s) containing missing values (geom_path).
如果有不清楚的地方,请告诉我。