使用 R 中的 dbplyr 计算 SQL 表每行的 TRUE/FALSE 值

问题描述

我使用 dbplyr 远程连接到 SQL 表。其中一个表由一个 ID 列和其他几个存储 0 和 1 值的列组成（SQL bit 类型，在 R 端被解释为布尔值 TRUE/FALSE），我只想在 R 中获得每一行中 1 的总数。

对于 R 中的普通表，使用例如 rowSums() 就很简单；不幸的是，它无法通过 dbplyr 工作（没有对应的 SQL 等效项）。

由于底层表非常大，出于显而易见的原因，我不想 collect() 数据。

在这样的背景下如何实现这一目标？

library(dplyr)
# Local case: every column has one value per ID (three rows).
DF <- tibble(
  ID = LETTERS[1:3],
  col1 = c(1, 1, 1),
  col2 = c(1, 1, 0),
  col3 = c(1, 0, 0)
)
# Sum across each row, dropping the first column (ID) before summing.
DF %>%
  summarise(sum = rowSums(select(., -1)))
#   sum
# 1   3
# 2   2
# 3   1

# If DF is a remote SQL table, the same call fails with the following error message:
# Error: nanodbc/nanodbc.cpp:1655: 42000: [Microsoft][ODBC sql Server Driver][sql Server]'rowSums' is not a recognized built-in function name.  [Microsoft][ODBC sql Server Driver][sql Server]

编辑 - 添加最小的可重现示例

根据 @Simon.S.A. 的回复，下面是最小可重现示例（MRE）：

# Table creation
DF <- tibble(
  ID = LETTERS[1:3],
  col1 = c(1, 1, 1),
  col2 = c(1, 1, 0),
  col3 = c(1, 0, 0)
)
# Rename all four columns (including ID) to names that contain spaces.
colnames(DF) <- c("col 1", "col 2", "col 3", "col 4")
# SQL simulation
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, DF)
con %>% tbl("DF") # just checking
# preparing formula: one "`name` = sum(`name`)" string per non-ID column
cols <- colnames(DF)[-1]
all_equations <- paste0("`", cols, "` =  sum(`", cols, "`)")
# actual query (this is the attempt that fails with the error shown below)
con %>%
  tbl("DF") %>%
  summarise(!!!rlang::parse_exprs(all_equations))
# Error: near "=": Syntax error
# %>% show_query() shows a strange query, but I am no SQL expert as you understood.
# also tried:
# all_equations <- paste(cols, "=  sum(", cols, ")")
# all_equations <- paste0("`[", cols, "]` =  sum(`[", cols, "]`)")

解决方法

这里的部分挑战是 dbplyr 将 dplyr 命令翻译成 SQL，但翻译只为某些 R 命令定义。由于标准 dplyr 命令存在翻译，我们可以使用 summarise。

要对每一列求和，我们可以这样做：

library(dplyr)
library(rlang)

# Every column except the first (ID) gets summed. Negative indexing also
# behaves correctly when DF has only one column (it yields an empty vector).
cols <- colnames(DF)[-1]

# One "sum(`name`)" string per column; backticks keep names with spaces valid.
all_equations <- paste0("sum(`", cols, "`)")

# Parse the strings into R expressions and name each one after its column.
# Parsing "name = sum(name)" instead would produce a call to `=` rather than
# a named argument, so the output names must be attached to the list itself.
sum_exprs <- parse_exprs(all_equations)
names(sum_exprs) <- cols

DF %>%
  summarise(!!!sum_exprs)

想法是构建每个和的文本字符串，然后使用 !!!parse_exprs(.) 将此文本转换为 R 代码。

编辑 - 相同的方法，但用于行总和

# Table creation: every column has one value per ID (three rows).
DF <- tibble(
  ID = LETTERS[1:3],
  col1 = c(1, 1, 1),
  col2 = c(1, 1, 0),
  col3 = c(1, 0, 0)
)
# Rename all four columns; "col 1" is the former ID column.
colnames(DF) <- c("col 1", "col 2", "col 3", "col 4")
# SQL simulation
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, DF)
con %>% tbl("DF") # just checking
# preparing formula: "`col 2` + `col 3` + `col 4`"
cols <- colnames(DF)[-1]
eq <- paste0("`", paste0(cols, collapse = "` + `"), "`")
# actual query: the parsed sum expression becomes the new column
con %>%
  tbl("DF") %>%
  mutate(new = !!parse_expr(eq))

但仍取决于 dbplyr 翻译，因此可能无法正确处理反引号。

我发现一种可能的解决方法是写下实际查询，例如使用 DBI 包。但我仍然对使用 dbplyr 的更优雅的方式感兴趣。

DF <- tibble(
  ID = LETTERS[1:3],
  col1 = c(1, 1, 1),
  col2 = c(1, 1, 0),
  col3 = c(1, 0, 0)
)
# having spaces in column names increase handling complexity
colnames(DF) <- c("col 1", "col 2", "col 3", "col 4")
# SQL simulation
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con, DF)
con %>% tbl("DF") # just checking
cols <- colnames(DF)[-1]
col2select <- colnames(DF) # column to select in the result
# Select every column, then append the row total: each summed column is
# bracket-quoted and cast to INT before the values are added together.
query <- paste0(
  "SELECT ", paste0("[", col2select, "]", collapse = ","), ", ",
  paste0("CAST([", cols, "] AS INT)", collapse = " + "),
  " AS sum FROM DF"
)
rs <- DBI::dbSendQuery(con, query)
DBI::dbFetch(rs)
DBI::dbClearResult(rs)
DBI::dbDisconnect(con)

使用 tidyr 使得 dplyr 代码可读性很强。这在大表上的表现如何还不太清楚。

library(dplyr,warn.conflicts = FALSE)
library(tidyr)
# One row per ID with three 0/1 columns.
DF <- tibble(
  ID = LETTERS[1:3],
  col1 = c(1, 1, 1),
  col2 = c(1, 1, 0),
  col3 = c(1, 0, 0)
)

# SQL simulation
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
DF <- copy_to(con, DF, overwrite = TRUE)

# Reshape to one (ID, name, value) row per cell, then total the values per ID.
result <-
  DF %>%
  pivot_longer(cols = -ID) %>%
  group_by(ID) %>%
  summarize(sum = sum(value, na.rm = TRUE))

result
#> # Source:   lazy query [?? x 2]
#> # Database: sqlite 3.35.5 [:memory:]
#>   ID      sum
#>   <chr> <dbl>
#> 1 A         3
#> 2 B         2
#> 3 C         1

result %>% show_query()
#> <SQL>
#> SELECT `ID`,SUM(`value`) AS `sum`
#> FROM (SELECT `ID`,'col1' AS `name`,`col1` AS `value`
#> FROM `DF`
#> UNION ALL
#> SELECT `ID`,'col2' AS `name`,`col2` AS `value`
#> FROM `DF`
#> UNION ALL
#> SELECT `ID`,'col3' AS `name`,`col3` AS `value`
#> FROM `DF`)
#> GROUP BY `ID`

由 reprex 包 (v2.0.0) 创建于 2021 年 6 月 18 日

library(dplyr,warn.conflicts = FALSE)
library(DBI)

# Benchmark fixture: 26,000 rows whose IDs cycle through the letters A-Z.
n <- 26e3
df <- tibble(ID = rep(LETTERS, n / 26))

# Append 100 columns (col1 ... col100) of random 0/1 draws.
for (i in seq_len(100)) {
  df[[paste0("col", i)]] <- rbinom(n = n, size = 1, prob = 0.5)
}

# SQL simulation
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")

df_sql <- copy_to(con, df, overwrite = TRUE)
#' Add a row-wise sum of every non-ID column to a remote table
#'
#' Builds one SELECT statement that returns all columns of the table plus a
#' `sum` column obtained by adding the non-ID columns together, so the
#' addition is performed by the database rather than in R.
#'
#' @param df Table whose `colnames()` give the columns of the remote table.
#'   The column named "ID" is excluded from the sum.
#' @param con DBI connection used to quote identifiers and to run the query.
#' @param table Name of the remote table to select from. Defaults to "df",
#'   the table name this function previously hard-coded. It is passed through
#'   `DBI::dbQuoteIdentifier()`.
#' @return The lazy table produced by `tbl(con, sql(query))`.
row_sum_1 <- function(df, con, table = "df") {
  sum_cols <- setdiff(colnames(df), "ID")
  if (length(sum_cols) == 0) {
    stop("`df` has no columns other than \"ID\" to sum.", call. = FALSE)
  }

  select_list <- paste(
    DBI::dbQuoteIdentifier(con, colnames(df)),
    collapse = ","
  )
  sum_sql <- paste(DBI::dbQuoteIdentifier(con, sum_cols), collapse = " + ")

  # The "," between the column list and the sum expression is required:
  # without it the last selected column runs straight into the first summand
  # and the statement is not valid SQL.
  query <- paste0(
    "SELECT ", select_list, ",", sum_sql,
    " AS sum FROM ", DBI::dbQuoteIdentifier(con, table)
  )
  tbl(con, sql(query))
}

# Preview only the ID and the computed row sum; the printed header below
# shows that the result is still a lazy query against the database.
row_sum_1(df_sql,con) %>% select(ID,sum)
#> # Source:   lazy query [?? x 2]
#> # Database: sqlite 3.35.5 [:memory:]
#>    ID      sum
#>    <chr> <int>
#>  1 A        49
#>  2 B        53
#>  3 C        54
#>  4 D        49
#>  5 E        51
#>  6 F        46
#>  7 G        55
#>  8 H        48
#>  9 I        44
#> 10 J        50
#> # … with more rows
# Time the query; compute() is used so the statement is actually executed
# rather than only being built lazily.
system.time(compute(row_sum_1(df_sql,con)))
#>    user  system elapsed 
#>   0.307   0.007   0.315

由 reprex 包 (v2.0.0) 创建于 2021 年 6 月 21 日

dbplyr r r sql-server