问题描述
我有以下df:
# Sample input: every cell holds one or more ";"-separated values.
df_1 <- data.frame(
  col_1 = c("a;b;c", "c;d", "e", "f", "g", "h;j"),
  col_2 = c("1;2;3", "4", "5;6", "7", "8;9", "10;11;12")
)
我想按分号把col_1拆分成单独的行,并且在col_2中存在对应值时,让每一行带上col_2的对应值。
-
例如,如果col_1中的元素数= col_2中的元素数,则应将它们与col_1和col_2中的相应值分开(第1行)
-
如果两列的元素数量不匹配(两列都包含多个元素,但个数不相等),则该行应保持原样(例如最后一行)
这是final_dataset:
# Expected result: one row per element where the two columns can be paired,
# and the last input row ("h;j" / "10;11;12") left unsplit because its
# element counts (2 vs 3) do not match.
df_2 <- data.frame(
  col_1 = c("a", "b", "c", "c", "d", "e", "e", "f", "g", "g", "h;j"),
  col_2 = c("1", "2", "3", "4", "4", "5", "6", "7", "8", "9", "10;11;12")
)
解决方法
我们可以使用cSplit
library(splitstackshape)
library(zoo)
# Number of ";"-separated elements in each cell. Elements are counted (not
# characters), so a multi-character value such as "10" counts once.
cnt1 <- lengths(strsplit(as.character(df_1$col_1), ";", fixed = TRUE))
cnt2 <- lengths(strsplit(as.character(df_1$col_2), ";", fixed = TRUE))
# Rows to leave untouched: both cells hold several elements, but the counts
# differ, so the elements cannot be paired up.
i1 <- cnt1 != cnt2 & cnt1 > 1 & cnt2 > 1
# Split the remaining rows into long format, drop rows where both columns
# are NA, fill the remaining NAs with the last observed value, then append
# the untouched rows.
rbind(
  cSplit(df_1[!i1, ], c("col_1", "col_2"), sep = ";", direction = "long")[
    !is.na(col_1) | !is.na(col_2), lapply(.SD, na.locf0)
  ],
  df_1[i1, ]
)
# col_1 col_2
# 1: a 1
# 2: b 2
# 3: c 3
# 4: c 4
# 5: d 4
# 6: e 5
# 7: e 6
# 8: f 7
# 9: g 8
#10: g 9
#11: h;j 10;11;12
或者只使用base R,同样满足上述所有约束条件
# Number of ";"-separated elements in each cell of both columns.
cnt1 <- lengths(strsplit(as.character(df_1$col_1), ";", fixed = TRUE))
cnt2 <- lengths(strsplit(as.character(df_1$col_2), ";", fixed = TRUE))
# Rows to leave untouched: both cells hold several elements, but the counts
# differ, so the elements cannot be paired up.
i1 <- cnt1 != cnt2 & cnt1 > 1 & cnt2 > 1
# Split every cell of the remaining rows into its elements; the result is a
# list with one entry per column, each entry being a list with one character
# vector per row.
lst1 <- lapply(
  df_1[!i1, ],
  function(x) strsplit(as.character(x), ";", fixed = TRUE)
)
out <- rbind(
  do.call(rbind, Map(function(x, y) {
    l1 <- length(x)
    l2 <- length(y)
    mx <- max(l1, l2)
    # A single element is repeated so it pairs with every element of the
    # other column.
    x <- if (l1 != l2 && l1 == 1) rep(x, mx) else x
    y <- if (l1 != l2 && l2 == 1) rep(y, mx) else y
    data.frame(col_1 = x, col_2 = y)
  }, lst1[[1]], lst1[[2]])),
  df_1[i1, ]
)
# Renumber the rows consecutively after stacking.
row.names(out) <- NULL
out
# col_1 col_2
#1 a 1
#2 b 2
#3 c 3
#4 c 4
#5 d 4
#6 e 5
#7 e 6
#8 f 7
#9 g 8
#10 g 9
#11 h;j 10;11;12
,
另一种方法是定义一个自定义函数f,逐行处理后再合并结果
# Expand one row of ";"-separated values into one row per element.
#
# v: a row whose first element is the col_1 string and whose second element
#    is the col_2 string (e.g. a row handed over by apply(df, 1, f)).
#
# Returns a data frame with columns col_1 and col_2:
#   * one row per element when both cells hold the same number of elements,
#     or when one of them holds a single element (which is then recycled);
#   * otherwise a single row holding the original, unsplit strings.
f <- function(v) {
  X <- unlist(strsplit(v[[1]], ";", fixed = TRUE))
  Y <- unlist(strsplit(v[[2]], ";", fixed = TRUE))
  n_min <- min(length(X), length(Y))
  # strsplit() turns an empty string into character(0). Without the
  # n_min > 0 guard, a row whose two cells are both empty would produce a
  # zero-row data frame and silently vanish from the combined result.
  if (n_min > 0 && (length(X) == length(Y) || n_min == 1)) {
    res <- data.frame(col_1 = X, col_2 = Y)
  } else {
    res <- data.frame(col_1 = v[[1]], col_2 = v[[2]])
  }
  res
}
# Run f over every row of df_1 and stack the per-row data frames.
df_2 <- do.call(
  what = rbind,
  args = apply(X = df_1, MARGIN = 1, FUN = f)
)
我们将会得到
col_1 col_2
1 a 1
2 b 2
3 c 3
4 c 4
5 d 4
6 e 5
7 e 6
8 f 7
9 g 8
10 g 9
11 h;j 10;11;12