问题描述
我正在创建一个汇总统计表,该表涉及来自四个不同区域(西部,中西部,东北和南部)的样本中的健康调查数据,编码为1-4区。现在,我可以创建一个表,以显示每个区域的统计数据,但是我还想添加一个额外的列,该列显示样本中所有个体的总体平均值或中位数。我怎样才能做到这一点?我已经包含了到目前为止完成的代码。谢谢!
#the data frame
structure(list(AGE = c(40L,23L,24L,18L,30L,33L,32L,63L,22L,24L),FAMSIZE = c(2L,2L,3L,6L,1L,1L
),HYPERTEN = c(0,1,0),ALC = c(0,2,3,2),region_group = c("Region 4","Region 3","Region 4","Region 1","Region 2","Region 4"),PSU = c(2L,2L),PERWEIGHT_MERGE = c(31.2,615.2,37.6,1626,44,149.8,745.2,984.2,1512,399.6),SAMPWEIGHT_MERGE = c(65,860.4,94.4,9146,170.8,310.4,755.2,1053.4,3964.4,706.2),STRATA = c(6296L,6165L,6296L,6224L,6045L,6083L,6029L,6073L,6287L,6247L
)),row.names = c(NA,10L),class = "data.frame")
#Code for the table
out1<-sample_survey %>%
group_by(region_group) %>%
summarise("Number of drinks (mean)"=survey_mean(ALC),"Number of drinks (median)"=survey_median(ALC),"Hypertension"=survey_mean(HYPERTEN),"Family
Size"=survey_mean(FAMSIZE),"Age"=survey_median(AGE))
out1=t(out1)
out1
[,1] [,2] [,3] [,4]
region_group "Region 1" "Region 2" "Region 3" "Region 4"
Number of drinks (mean) "1.663778" "2.131566" "1.744107" "2.009594"
Number of drinks (mean)_se "0.1375124" "0.1245772" "0.0957500" "0.1199982"
Number of drinks (median) "1" "2" "1" "2"
Number of drinks (median)_se "0.0000000" "0.2531528" "0.0000000" "0.2533324"
Hypertension "0.1340147" "0.1685102" "0.1834528" "0.1225418"
Hypertension_se "0.01623974" "0.01529678" "0.01463019" "0.01475651"
Family \n Size "3.121062" "2.883905" "3.107202" "3.265012"
Family \n Size_se "0.11668906" "0.07435704" "0.08004129" "0.11138869"
Age "30" "27" "30" "28"
Age_se "1.3615690" "1.0126110" "0.7616152" "0.7599972"
解决方法
虽然我非常肯定有一些软件包可以立即实现,但要实现这一目标的一种方法是对整个数据集重复摘要,并在转置之前将表绑定在一起。为此,为了减少代码重复,我将摘要代码放在了辅助函数中。试试这个:
library(dplyr)
library(srvyr)
# Helper function
mysum <- function(d) {
d %>%
summarise(
"Number of drinks (mean)" = survey_mean(ALC),"Number of drinks (median)" = survey_median(ALC),"Hypertension" = survey_mean(HYPERTEN),"Family Size" = survey_mean(FAMSIZE),"Age" = survey_median(AGE)
)
}
sample_survey <- sample_survey %>%
as_survey_design(strata = region_group)
#Code for the table
out1<-sample_survey %>%
group_by(region_group) %>%
mysum()
out2<-sample_survey %>%
mysum() %>%
mutate(region_group = "Total")
bind_rows(out1,out2) %>%
t()
#> [,1] [,2] [,3]
#> region_group "Region 1" "Region 2" "Region 3"
#> Number of drinks (mean) "2.50" "0.00" "1.00"
#> Number of drinks (mean)_se "0.5000000" "0.0000000" "1.0000000"
#> Number of drinks (median) "2" "0" "0"
#> Number of drinks (median)_se "0.03935085" "0.00000000" "0.07870171"
#> Hypertension "0.0" "0.5" "0.0"
#> Hypertension_se "0.0" "0.5" "0.0"
#> Family Size "2.00" "3.50" "2.50"
#> Family Size_se "0.0000000" "2.5000000" "0.5000000"
#> Age "30" "33" "18"
#> Age_se "0.07870171" "1.18052560" "0.19675427"
#> [,4] [,5]
#> region_group "Region 4" "Total"
#> Number of drinks (mean) "1.75" "1.40"
#> Number of drinks (mean)_se "0.6291529" "0.3366502"
#> Number of drinks (median) "2" "2"
#> Number of drinks (median)_se "0.47133552" "0.50276759"
#> Hypertension "0.0" "0.1"
#> Hypertension_se "0.0" "0.1"
#> Family Size "1.75" "2.30"
#> Family Size_se "0.2500000" "0.5196152"
#> Age "24" "24"
#> Age_se "2.82801315" "2.02169598"
,
您还可以运行相同的计算,而无需包括group_by
,然后将新的结果列绑定到您创建的第一个表。 (我不得不回过头来将survey_mean
和survey_median
分别更改为mean
和median
,否则就给我带来了一个错误。)
library(dplyr)
library(srvyr)
#the data frame
sample_survey <- structure(list(AGE = c(40L,23L,24L,18L,30L,33L,32L,63L,22L,24L),FAMSIZE = c(2L,2L,3L,6L,1L,1L),HYPERTEN = c(0,1,0),ALC = c(0,2,3,2),region_group = c("Region 4","Region 3","Region 4","Region 1","Region 2","Region 4"),PSU = c(2L,2L),PERWEIGHT_MERGE = c(31.2,615.2,37.6,1626,44,149.8,745.2,984.2,1512,399.6),SAMPWEIGHT_MERGE = c(65,860.4,94.4,9146,170.8,310.4,755.2,1053.4,3964.4,706.2),STRATA = c(6296L,6165L,6296L,6224L,6045L,6083L,6029L,6073L,6287L,6247L )),row.names = c(NA,10L),class = "data.frame")
#Code for the table
out1<-sample_survey %>%
group_by(region_group) %>%
summarise("Number of drinks (mean)"=mean(ALC),"Number of drinks (median)"=median(ALC),"Hypertension"=mean(HYPERTEN),"Family
Size"=mean(FAMSIZE),"Age"=median(AGE))
out1=t(out1)
#new code
out2 <- sample_survey %>%
summarise("All Regions" = "All Regions","Number of drinks (mean)"=mean(ALC),"Age"=median(AGE))
out2 = t(out2)
cbind(out1,out2)
> cbind(out1,out2)
[,1] [,2] [,3] [,4] [,5]
region_group "Region 1" "Region 2" "Region 3" "Region 4" "All Regions"
Number of drinks (mean) "2.50" "0.00" "1.00" "1.75" "1.4"
Number of drinks (median) "2.5" "0.0" "1.0" "2.0" "2"
Hypertension "0.0" "0.5" "0.0" "0.0" "0.1"
Family \n Size "2.00" "3.50" "2.50" "1.75" "2.3"
Age "31.0" "48.0" "20.5" "24.0" "27"