问题描述
我有两个数据框:
df1 <- data.frame(levels = c(1,3,5,7,9),values = c(2.2,5.3,7.9,5.4,8.7))
df2 <- data.frame(levels = c(1,4,8,12)) # other columns not necessary
我希望根据 df1$levels 中的数字将 df1$values 插入到 df2$levels 中。所以有一些内插,但也有外推到第二个数据帧中的第 12 级。
解决方法
也许,根据两个数据集的 complete
的 union
执行 levels
,然后将 na.approx
(来自 zoo
)与 {{1} }(用于外推)
rule = 2
-输出
library(dplyr)
library(tidyr)
library(zoo)
df1 <- df1 %>%
complete(levels = union(levels,df2$levels)) %>%
mutate(values = na.approx(values,maxgap = Inf,rule = 2))
,
我确定这可以压缩,这是我很久以前写的一些代码,用于处理必须在有序向量的头部/尾部进行外推的情况:
# Function to interpolate / extrapolate: l_estimate => function()
l_estimate <- function(vec){
# Function to perform-linear interpolation and return vector:
# .l_interp_vec => function()
.l_interp_vec <- function(vec){
interped_values <-
approx(x = vec,method = "linear",ties = "constant",n = length(vec))$y
return(ifelse(is.na(vec),interped_values[is.na(vec)],vec))
}
# Store a vector denoting the indices of the vector that are NA:
# na_idx => integer vector
na_idx <- is.na(vec)
# Store a scalar of min row where x isn't NA: min_non_na => integer vector
min_non_na <- min(which(!(na_idx)))
# Store a scalar of max row where x isn't NA: max_non_na => integer vector
max_non_na <- max(which(!(na_idx)))
# Store scalar of the number of rows needed to impute prior
# to first NA value: ru_lower => integer vector
ru_lower <- ifelse(min_non_na > 1,min_non_na - 1,min_non_na)
# Store scalar of the number of rows needed to impute after
# the last non-NA value: ru_upper => integer vector
ru_upper <- ifelse(
max_non_na == length(vec),length(vec) - 1,(length(vec) - (max_non_na + 1))
)
# Store a vector of the ramp to function: ramp_up => numeric vector
ramp_up <- as.numeric(
cumsum(rep(vec[min_non_na]/(min_non_na),ru_lower))
)
# Apply the interpolation function on vector: y => numeric vector
y <- as.numeric(.l_interp_vec(as.numeric(vec[min_non_na:max_non_na])))
# Create a vector that combines the ramp_up vector
# and y if the first NA is at row 1:
if(length(ramp_up) >= 1 & max_non_na != length(vec)){
# Create a vector interpolations if there are
# multiple NA values after the last value: lower_l_int => numeric vector
lower_l_int <- as.numeric(
cumsum(rep(mean(diff(c(ramp_up,y))),ru_upper+1)) +
as.numeric(vec[max_non_na])
)
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(c(ramp_up,y,lower_l_int))
}else if(length(ramp_up) > 1 & max_non_na == length(vec)){
# Store the linear interpolations in a vector: z => numeric
z <- as.numeric(c(ramp_up,y))
}else if(min_non_na == 1 & max_non_na != length(vec)){
# Create a vector interpolations if there are
# multiple NA values after the last value: lower_l_int => numeric vector
lower_l_int <- as.numeric(
cumsum(rep(mean(diff(c(ramp_up,ru_upper+1)) +
as.numeric(vec[max_non_na])
)
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(c(y,lower_l_int))
}else{
# Store the linear interpolations in a vector: z => numeric vector
z <- as.numeric(y)
}
# Interpolate between points in x,return new x:
return(as.numeric(ifelse(is.na(vec),z,vec)))
}
# Apply the function on ordered data: data.frame => stdout(console)
transform(full_df[order(full_df$levels),],values = l_estimate(values)
)