对于不同的指标,按唯一变量名称对变量求和 - 需要在变量名称的前缀之前/之后找到唯一名称

问题描述

是否有一种方法可以对数据框中所有唯一变量名称(可口可乐和百事可乐等品牌)的变量(例如销售额和单位)求和。 为了提供帮助,这里有一些示例数据。

# Build a reproducible one-week example data set: one row per day, with a
# Units and a Sales column for every brand/type combination.
set.seed(123)
period <- seq(as.Date("2021/01/01"), as.Date("2021/01/07"), by = "day")
# Every column must have exactly one value per day; without an explicit size
# sample() returns a full-length permutation of its input and data.frame()
# then fails because the columns have differing lengths.
n_days <- length(period)
Coke_Regular_Units <- sample(1000:2000, n_days, replace = TRUE)
Coke_Diet_Units <- sample(1000:2000, n_days, replace = TRUE)
Coke_Regular_Sales <- sample(500:1000, n_days, replace = TRUE)
Coke_Diet_Sales <- sample(500:1000, n_days, replace = TRUE)
Pepsi_Regular_Units <- sample(1000:2000, n_days, replace = TRUE)
Pepsi_Diet_Units <- sample(1000:2000, n_days, replace = TRUE)
Pepsi_Regular_Sales <- sample(500:1000, n_days, replace = TRUE)
Pepsi_Diet_Sales <- sample(500:1000, n_days, replace = TRUE)
# Include `period` so the frame matches the printed head(df) and can be used
# as the id column when reshaping.
df <- data.frame(
  period,
  Coke_Regular_Units, Coke_Diet_Units, Coke_Regular_Sales, Coke_Diet_Sales,
  Pepsi_Regular_Units, Pepsi_Diet_Units, Pepsi_Regular_Sales, Pepsi_Diet_Sales
)

> head(df)
      period Coke_Regular_Units Coke_Diet_Units Coke_Regular_Sales Coke_Diet_Sales Pepsi_Regular_Units
1 2021-01-01               1414            1117                589             847                1425
2 2021-01-02               1462            1298                590             636                1648
3 2021-01-03               1178            1228                755             976                1765
4 2021-01-04               1525            1243                696             854                1210
5 2021-01-05               1194            1013                998             827                1931
6 2021-01-06               1937            1373                590             525                1589
  Pepsi_Diet_Units Pepsi_Regular_Sales Pepsi_Diet_Sales
1             1554                 608              943
2             1870                 762              808
3             1372                 892              634
4             1843                 924              808
5             1142                 829              910
6             1543                 522              723

我希望有一段代码能自动计算 Coke_Sales、Coke_Units、Pepsi_Sales、Pepsi_Units、Regular_Sales 和 Diet_Units 等汇总列。

我目前对每个变量都这样做

library(dplyr) 
# Manual approach: for one brand/metric pair, select the columns whose names
# contain both the brand and the metric, keep only the numeric ones, and add
# them up row by row.
df$Coke_Sales <- rowSums(
  Filter(is.numeric, select(df, matches("Coke") & matches("Sales")))
)
# The column selection has to go through select(df, ...): matches() is a
# selection helper and cannot be evaluated on its own.
df$Coke_Units <- rowSums(
  Filter(is.numeric, select(df, matches("Coke") & matches("Units")))
)

这对少量变量没问题,但我需要对 100 多个变量执行此操作。是否有函数可以实现这一点?它需要自动找出唯一的名称片段,如 Coke、Pepsi、Diet 和 Regular。指标是变量名称的最后一部分,因此不一定需要自动识别,不过能自动识别就更好了。如果更方便,也可以手动指定指标,因为指标最多只有 3 个,而品牌有数百个。

如果无法完全自动化,有没有办法简化为由我指定所需的变量?这虽然不完美,但仍是一种改进。例如,用下面这样的代码行来指定要求和的变量和所需的指标。

# Name fragments (brands and product types) to aggregate over.
VarsToSum <- c("Coke","Pepsi","Diet","Regular")
# Metric suffixes; each fragment above would be summed once per metric.
Metrics <- c("Sales","Units")

如果这种方式也行不通,也许我需要把问题分解成更小的步骤,任何提示都会很有帮助。我在考虑的思路是:先找出分隔符“_”之前的唯一名称,然后对这些唯一名称分别计算“Sales”和“Units”的合计。这是最好的方法吗?还是应该先重塑数据?或者还有其他可行的途径?

任何帮助或指导如何实现这一点将不胜感激。谢谢

解决方法

这是一种data.table方法...

library( data.table )
# Convert df to a data.table in place.
setDT(df)

# Reshape wide -> long: one row per (period, original column) pair, keeping
# the original column name as a character column called `variable`.
ans <- melt(
  df,
  id.vars = "period",
  variable.name = "variable",
  value.name = "value",
  variable.factor = FALSE
)

# Split each column name on "_" into its three parts.
ans[, c("brand", "type", "what") := tstrsplit(variable, "_")]
# > head(ans)
#        period           variable value brand    type  what
# 1: 2021-01-01 Coke_Regular_Units  1414  Coke Regular Units
# 2: 2021-01-02 Coke_Regular_Units  1462  Coke Regular Units
# 3: 2021-01-03 Coke_Regular_Units  1178  Coke Regular Units
# 4: 2021-01-04 Coke_Regular_Units  1525  Coke Regular Units
# 5: 2021-01-05 Coke_Regular_Units  1194  Coke Regular Units
# 6: 2021-01-06 Coke_Regular_Units  1937  Coke Regular Units

# Aggregate over whichever of the new columns you need; here all three.
ans[, .(total = sum(value)), by = .(brand, type, what)]
#    brand    type  what total
# 1:  Coke Regular Units 10527
# 2:  Coke    Diet Units  8936
# 3:  Coke Regular Sales  5158
# 4:  Coke    Diet Sales  5171
# 5: Pepsi Regular Units 11160
# 6: Pepsi    Diet Units 10813
# 7: Pepsi Regular Sales  5447
# 8: Pepsi    Diet Sales  5491
,

使用 outer 和 paste 拼接各名称片段,再用 grep 匹配列名

# Build one regular expression per brand/metric combination, e.g.
# "Coke.*Sales", by pasting every brand with every metric.
patterns <- outer(c("Coke", "Pepsi"), c("Sales", "Units"), paste, sep = ".*")

# For each pattern, pick the columns whose names match it and sum them per
# row; the result has one column per pattern.
sapply(patterns, function(pattern) {
  matching_cols <- grep(pattern, names(df))
  rowSums(df[matching_cols])
})
#      Coke.*Sales Pepsi.*Sales Coke.*Units Pepsi.*Units
# [1,]        1436         1551        2531         2979
# [2,]        1226         1570        2760         3518
# [3,]        1731         1526        2406         3137
# [4,]        1550         1732        2768         3053
# [5,]        1825         1739        2207         3073
# [6,]        1115         1245        3310         3132
# [7,]        1446         1575        3481         3081
,

这里的解决方案在精神上与@Wimpel 的解决方案相似,但使用了 tidyverse :

library(tidyverse)

# Total Sales and Units per brand, returned as a single wide row with
# columns named "<brand>_<metric>".
summary_df <-
  df %>%
  # Split each column name into brand and metric. The middle part (the
  # product type) is matched but not captured, so the pattern defines exactly
  # as many groups as there are entries in `names_to`. ".value" turns the
  # metric part into the output columns Sales and Units.
  pivot_longer(
    cols = ends_with("Sales") | ends_with("Units"),
    names_to = c("brand", ".value"),
    names_pattern = "(.*)_.*_(.*)"
  ) %>%
  group_by(brand) %>%
  summarize(Sales = sum(Sales), Units = sum(Units)) %>%
  # Spread both metrics back out; names_glue is an argument of pivot_wider()
  # itself, not an element of values_from.
  pivot_wider(
    names_from = "brand",
    values_from = c("Sales", "Units"),
    names_glue = "{brand}_{.value}"
  )

summary_df
  # # A tibble: 1 x 4
  #   Coke_Sales Pepsi_Sales Coke_Units Pepsi_Units
  #       <int>       <int>      <int>       <int>
  # 1      10329       10938      19463       21973