R使用tidyverse用其他值的总和填充缺失值

问题描述

我有一个包含多列多行的数据框。

col_1 | col_2 | ... | col_n
 35   |  NA   | ... |   2
  .   |   .   |  .  |   .
  .   |   .   |  .  |   .
  .   |   .   |  .  |   .
 123  |  90   | ... |   NA

某些行包含 NA 值（可以超过 1 个 NA）。

我希望找到恰好包含 1 个 NA 的所有行，并将其替换为其他列的总和。

我如何使用 tidyverse 实现它？

解决方法

我使用了 Anil Goyal 的玩具数据（谢谢！）

今天有一个类似的问题，请看这里： R: Replace NA with other variables in the df using tidyverse

这里的步骤是：

对每一行求和（忽略 NA）
计算每一行中 NA 的数量
然后在以 col 开头的各列（col_1 到 col_4）上应用所需的条件：仅当该行恰好有 1 个 NA 时，才用行总和替换该 NA
我最喜欢的是 .keep = "unused"，它会删除计算中用到的辅助（“helper”）列。

# Step 1: helper column with the row total, ignoring NAs.
# Step 2: helper column with the number of NAs per row (rowsum1 is included
#         in the count, exactly as in a select(., everything()) call).
# Step 3: in rows with exactly one NA, fill that NA with the row total;
#         all other rows are only converted to numeric.
# .keep = "unused" drops the two helper columns because they were consumed.
df %>%
  mutate(rowsum1 = rowSums(., na.rm = TRUE)) %>%
  mutate(count_na = rowSums(is.na(.))) %>%
  mutate(
    across(
      starts_with("col"),
      ~ case_when(
        count_na == 1 ~ coalesce(.x, rowsum1),
        TRUE ~ as.numeric(.x)
      )
    ),
    .keep = "unused"
  )

输出：

 col_1 col_2 col_3 col_4
1    35   421  1223   767
2    43    54   435    78
3   234    NA    NA    65
4   784     8   687    89
5    23    45    78   146

两种tidyverse方法

#toy data
# Every column must have 5 values. col_2 and col_3 both carry an NA in row 3,
# which makes row 3 the "more than one NA" case that must be left untouched.
df <- data.frame(
  col_1 = c(35L, 43L, 234L, NA, 23L),
  col_2 = c(421L, 54L, NA, 8L, 45L),
  col_3 = c(NA, 435L, NA, 687L, 78L),
  col_4 = c(767L, 78L, 65L, 89L, NA)
)
df
#>   col_1 col_2 col_3 col_4
#> 1    35   421    NA   767
#> 2    43    54   435    78
#> 3   234    NA    NA    65
#> 4    NA     8   687    89
#> 5    23    45    78    NA

#load libraries
library(tidyverse)

#1st approach
# For every column, a cell is filled only when it is NA and its row contains
# exactly one NA; the fill value is the sum of the row's non-missing values.
# NOTE: cur_data() is deprecated as of dplyr 1.1.0; pick(everything()) is the
# replacement on newer versions.
df %>%
  mutate(
    across(
      everything(),
      ~ ifelse(
        rowSums(is.na(cur_data())) == 1 & is.na(.),
        rowSums(cur_data(), na.rm = TRUE),
        .
      )
    )
  )
#>   col_1 col_2 col_3 col_4
#> 1    35   421  1223   767
#> 2    43    54   435    78
#> 3   234    NA    NA    65
#> 4   784     8   687    89
#> 5    23    45    78   146

#2nd approach
# Works row by row: after rowwise(), cur_data() is the current one-row data
# frame. The logical matrix marks the NA cell only when that row has exactly
# one NA, and replace() writes the row total (NAs ignored) into it. The
# unnamed data frame returned to mutate() overwrites the matching columns.
df %>%
  rowwise() %>%
  mutate(
    replace(
      cur_data(),
      is.na(cur_data()) & rowSums(is.na(cur_data())) == 1,
      rowSums(cur_data(), na.rm = TRUE)
    )
  )
#> # A tibble: 5 x 4
#> # Rowwise: 
#>   col_1 col_2 col_3 col_4
#>   <int> <int> <int> <int>
#> 1    35   421  1223   767
#> 2    43    54   435    78
#> 3   234    NA    NA    65
#> 4   784     8   687    89
#> 5    23    45    78   146

^{由 reprex package (v2.0.0) 于 2021 年 6 月 5 日创建}

我们可以将 coalesce 与 rowSums 一起使用以提高效率

library(dplyr)
# v1 holds the row total (NAs ignored) for rows with exactly one NA and NA
# for every other row, so coalesce() has nothing to fill in those rows.
v1 <- rowSums(df, na.rm = TRUE)
v1[rowSums(is.na(df)) != 1] <- NA
# unlist(df) walks the cells column by column; row(df) gives the row index of
# each cell in that same order, so every cell is paired with its row's v1.
df[] <- coalesce(unlist(df), v1[row(df)])

-输出

 df
  col_1 col_2 col_3 col_4
1    35   421  1223   767
2    43    54   435    78
3   234    NA    NA    65
4   784     8   687    89
5    23    45    78   146

或者也可以把上面写成

# rS is the row total for rows with exactly one NA and NA otherwise
# (NA^FALSE is 1, NA^TRUE is NA); it is used as the fill value and then dropped.
df %>%
  mutate(
    rS = rowSums(cur_data(), na.rm = TRUE) *
      NA^(rowSums(is.na(cur_data())) != 1)
  ) %>%
  mutate(across(-rS, ~ coalesce(.x, rS))) %>%
  mutate(rS = NULL)
#  col_1 col_2 col_3 col_4
#1    35   421  1223   767
#2    43    54   435    78
#3   234    NA    NA    65
#4   784     8   687    89
#5    23    45    78   146

您也可以使用以下解决方案，它是 replace 函数的替代：

library(dplyr)
library(purrr)

# Each row arrives as named arguments; c(...) turns it into a named vector.
# Rows without exactly one NA are returned unchanged, otherwise the single NA
# is filled with the sum of the remaining values.
df %>%
  pmap_df(function(...) {
    row_vals <- c(...)
    if (sum(is.na(row_vals)) != 1) {
      return(row_vals)
    }
    coalesce(row_vals, sum(row_vals, na.rm = TRUE))
  })

# A tibble: 2 x 4
  col_1 col_2 col_3 col_n
  <dbl> <dbl> <dbl> <dbl>
1    35    42     5     2
2   123    90    NA    NA

数据

# Two-row sample: row 1 has exactly one NA, row 2 has two.
data.frame(
  col_1 = c(35, 123),
  col_2 = c(NA, 90),
  col_3 = c(5, NA),
  col_n = c(2, NA)
)

对之前的答案（原文链接：here）稍作修改，使其只处理恰好包含 1 个 NA 的行：

# apply() passes each row as a vector; rows with exactly one NA get that NA
# overwritten with the sum of the other values. apply() returns the rows as
# columns, so t() flips them back. The result is a matrix, not a data frame.
df <- t(apply(df, 1, function(row_vals) {
  missing <- is.na(row_vals)
  if (sum(missing) != 1) {
    return(row_vals)
  }
  row_vals[missing] <- sum(row_vals, na.rm = TRUE)
  row_vals
}))

或使用 purrr::pmap_df :

# Each row arrives as named arguments; c(...) collects it into a named vector.
# A row with exactly one NA gets that NA replaced by the sum of its other
# values; every other row is returned unchanged.
df <- purrr::pmap_df(df, ~ {
  x <- c(...)
  if (sum(is.na(x)) == 1) {
    replace(x, is.na(x), sum(x, na.rm = TRUE))
  } else {
    x
  }
})

使用两个中间变量的替代解决方案：

q_na 每行 NA 数
s_row 行值的总和，不包括 NA。

library(tidyverse)

# Example data: rows 1 and 5 have exactly one NA (to be filled with the sum of
# the other two values), rows 2 and 4 have several NAs, row 3 has none.
df <- tribble(
  ~col_1, ~col_2, ~col_3,
  NA, 1, 3,
  NA, NA, 2,
  1, 5, 6,
  NA, NA, NA,
  3, NA, 2
)

# q_na  : number of NAs in the row
# s_row : sum of the row's non-missing values
# Only rows with exactly one NA are kept; their NA is filled with s_row and
# the helper columns are dropped by the final select().
df %>%
  rowwise() %>%
  mutate(
    q_na = sum(is.na(c_across(col_1:col_3))),
    s_row = sum(c_across(col_1:col_3), na.rm = TRUE)
  ) %>%
  ungroup() %>%
  filter(q_na == 1) %>%
  mutate(across(col_1:col_3, ~ coalesce(.x, s_row))) %>%
  dplyr::select(col_1:col_3)

#> # A tibble: 2 x 3
#>   col_1 col_2 col_3
#>   <dbl> <dbl> <dbl>
#> 1     4     1     3
#> 2     3     5     2

如果您想保留所有行，只需删除 filter，并把 q_na == 1 这一条件放进 if_else 中：

# Same helpers as the filtered version (q_na = NAs per row, s_row = sum of the
# non-missing values), but no rows are dropped: the "exactly one NA" check
# moves into if_else(), so rows with zero or several NAs pass through as is.
df %>%
  rowwise() %>%
  mutate(
    q_na = sum(is.na(c_across(col_1:col_3))),
    s_row = sum(c_across(col_1:col_3), na.rm = TRUE)
  ) %>%
  ungroup() %>%
  mutate(across(col_1:col_3, ~ if_else(q_na == 1 & is.na(.x), s_row, .x))) %>%
  dplyr::select(col_1:col_3)

#> # A tibble: 5 x 3
#>   col_1 col_2 col_3
#>   <dbl> <dbl> <dbl>
#> 1     4     1     3
#> 2    NA    NA     2
#> 3     1     5     6
#> 4    NA    NA    NA
#> 5     3     5     2

^{由 reprex package (v0.3.0) 于 2021 年 6 月 5 日创建}

missing-data na na r r tidyverse