问题描述
我有两组数据。第一组是 100 个 (x_0,y_0) 数据点,称为 first_data:
# 10 x 10 grid of query points: every (x_0, y_0) pair with both coordinates
# in 1..10, i.e. 100 rows. expand.grid() names the columns Var1 and Var2.
x_0 <- seq(1, 10, by = 1)
y_0 <- seq(1, 10, by = 1)
data <- expand.grid(x_0, y_0)
# The question text refers to this grid as `first_data`; keep both names.
first_data <- data
第二个是 5 个 (x,y) 数据,称为 second_data:
# Five labelled reference points. `color` must have one label per point;
# data.frame() refuses columns of differing lengths.
x <- c(2, 4, 6, 8, 10)
y <- c(3, 5, 7, 9, 11)
color <- c("green", "green", "red", "red", "red")
second_data <- data.frame(x, y, color)
我需要在 3NN(3 近邻)中使用欧几里得距离公式,根据欧几里得距离确定第一个数据集中的每个点是绿色还是红色。基本上,我需要对 100 个点中的每一个点计算它到第二个数据集中 5 个点的距离,然后使用下面的代码选出距离最小的 3 个。
我想我需要一个循环,但我没能把它写对:
# Asker's attempted 3-NN loop (reported as not working). For each row index k
# of first_data it tries to count how many of the K nearest rows of
# second_data are green and store a majority label in out[k].
out <- rep(NA,nrow(first_data))
K=3
for(k in 1:nrow(first_data)){
# NOTE(review): sqrt() wraps only the first term, and `^2` is applied to
# first_data[k] instead of to the differences, so this is not the Euclidean
# distance sqrt((x - x0)^2 + (y - y0)^2).
# NOTE(review): if first_data is a data frame, first_data[k] selects column k,
# not row k; the same value is also used for both the x and the y term.
green <- mutate(second_data,distance = sqrt(x - first_data[k]^2)+(y-first_data[k]^2)) %>%
slice_min(distance,n=K) %>% filter(color=='green') %>% nrow()
# NOTE(review): `new_blue` is not assigned anywhere in the visible code; the
# count computed above is stored in `green`.
out[k] <- ifelse(new_blue >= (K+1)/2,'green','red')
}
解决方法
如果我理解正确的话,FNN 包中的 get.knnx 函数可以很容易地做到这一点:
# For every row of `data` (the query grid), find the indices of and distances
# to its 3 nearest rows of second_data; the third column (color) is dropped so
# that only the x/y coordinates are used.
library(FNN)
neighbors3 <- get.knnx(data = second_data[, -3], query = data, k = 3)
str(neighbors3)
# List of 2
# $ nn.index: int [1:100,1:3] 1 1 1 1 1 1 2 2 2 3 ...
# $ nn.dist : num [1:100,1:3] 2.24 2 2.24 2.83 3.61 ...
head(neighbors3$nn.index)
# [,1] [,2] [,3]
# [1,] 1 2 3
# [2,] 1 2 3
# [3,] 1 2 3
# [4,] 1 2 3
# [5,] 1 2 3
# [6,] 1 2 3
列表元素 neighbors3$nn.index 为 data 中的每一行给出它在 second_data 中的三个最近邻(行号)。现在获取这些邻居的颜色:
# Translate every neighbour index into the color of that row of second_data.
# nn.index is read column-major and matrix() fills column-major, so cell
# [i, j] of `result` is the color of the j-th nearest neighbour of query
# point i. Dimensions are taken from nn.index instead of being hard-coded.
result <- matrix(
  as.character(second_data$color[neighbors3$nn.index]),
  nrow = nrow(neighbors3$nn.index),
  ncol = ncol(neighbors3$nn.index)
)
head(result); cat("\n"); tail(result)
# [,1] [,2] [,3]
# [1,] "green" "green" "red"
# [2,] "green" "green" "red"
# [3,] "green" "green" "red"
# [4,] "green" "green" "red"
# [5,] "green" "green" "red"
# [6,] "green" "green" "red"
#
# [,1] [,2] [,3]
# [95,] "red" "red" "green"
# [96,] "red" "red" "red"
# [97,] "red" "red" "red"
# [98,] "red" "red" "red"
# [99,] "red" "red" "red"
# [100,] "red" "red" "red"
如果需要,您可以将所有内容与原始数据合并:
# Put the query coordinates, neighbour indices, neighbour colors and
# neighbour distances side by side, one row per query point.
results <- cbind(data, neighbors3$nn.index, result, neighbors3$nn.dist)
colnames(results) <- c(
  "x0", "y_0",
  paste0("nn", 1:3),
  paste0("col", 1:3),
  paste0("dist", 1:3)
)
head(results)
# x0 y_0 nn1 nn2 nn3 col1 col2 col3 dist1 dist2 dist3
# 1 1 1 1 2 3 green green red 2.236068 5.000000 7.810250
# 2 2 1 1 2 3 green green red 2.000000 4.472136 7.211103
# 3 3 1 1 2 3 green green red 2.236068 4.123106 6.708204
# 4 4 1 1 2 3 green green red 2.828427 4.000000 6.324555
# 5 5 1 1 2 3 green green red 3.605551 4.123106 6.082763
# 6 6 1 1 2 3 green green red 4.472136 4.472136 6.000000
,
如果我理解正确,下面的代码可以得到正确的分类:
# Base-R k-NN: label every row of `data` with the majority color among its
# k nearest rows of second_data. Squared Euclidean distance is enough for
# ranking, so no sqrt() is taken.
data$color <- NA_character_
k <- 3
for (i in seq_len(nrow(data))) {
  sq_dist <- (second_data$x - data$Var1[i])^2 +
    (second_data$y - data$Var2[i])^2
  # order() leaves ties in their original order, which matches picking the
  # first minimum repeatedly with which.min().
  nearest <- order(sq_dist)[seq_len(k)]
  votes <- table(as.character(second_data$color[nearest]))
  data$color[i] <- names(sort(votes, decreasing = TRUE))[1]
}
data