R中的相关矩阵:基于阈值快速删除最少数量的特征

问题描述

假设我有一个相关矩阵:

library(data.table)

set.seed(1)
a <- matrix(rnorm(1000),nrow = 20,ncol = 50)
colnames(a) <- paste0('var',1:50)
a[1:5,1:5]
#             var1        var2       var3         var4       var5
# [1,] -0.62645381  0.91897737 -0.1645236  2.401617761 -0.5686687
# [2,]  0.18364332  0.78213630 -0.2533617 -0.039240003 -0.1351786
# [3,] -0.83562861  0.07456498  0.6969634  0.689739362  1.1780870
# [4,]  1.59528080 -1.98935170  0.5566632  0.028002159 -1.5235668
# [5,]  0.32950777  0.61982575 -0.6887557 -0.743273209  0.5939462

cor_mat <- cor(a)
cor_mat[1:5,1:5]
#            var1        var2       var3       var4        var5
# var1  1.0000000 -0.21752487  0.2976402 -0.1523604 -0.37085773
# var2 -0.2175249  1.00000000 -0.2839989  0.1778480 -0.06401162
# var3  0.2976402 -0.28399885  1.0000000  0.2180834  0.13805728
# var4 -0.1523604  0.17784796  0.2180834  1.0000000 -0.27922504
# var5 -0.3708577 -0.06401162  0.1380573 -0.2792250  1.00000000

我想使彼此之间具有绝对相关性值小于阈值的要素的数量尽可能保持最大。这是我所做的:

diag(cor_mat) <- 0

feature_removed <- c()
i = 1
temp = 1
threshold = 0.3

while (max(temp) > 0) {
  temp <- apply(cor_mat,1,function(x) {
    c(sum(abs(x) > threshold),sum(abs(x)))
  })
  
  temp <- as.data.table(t(temp))
  temp[,features := colnames(cor_mat)]
  
  setorder(temp,-V1,-V2)
  head(temp)
  #    V1        V2 features
  # 1: 18 12.19406    var15
  # 2: 15 11.16353    var49
  # 3: 15 10.82047     var8
  # 4: 14 10.99392    var39
  # 5: 13 10.69978    var31
  # 6: 13 10.52861     var3
  
  feature_removed[i] <- temp[1,features]
  
  indx <- which(colnames(cor_mat) %in% feature_removed[i])
  
  cor_mat <- cor_mat[-indx,-indx]; gc()
  
  temp <- temp[-1,V1]
  i = i+1
}

feature_removed
# [1] "var15" "var49" "var39" "var8"  "var38" "var31" "var40" "var41" "var14" "var27" "var22" "var45" "var18" "var11" "var23" "var4"  "var42" "var13"
# [19] "var43" "var1"  "var34" "var2"  "var46" "var3"  "var35" "var7"  "var5"  "var21" "var50" "var12" "var36" "var16" "var44" "var48" "var10" "var47"
# [37] "var25" "var33"

cor_mat <- cor(a)
indx <- which(! colnames(cor_mat) %in% feature_removed)
cor_mat[indx,indx]
#              var6        var9       var17       var19       var20        var24       var26       var28         var29         var30        var32       var37
# var6   1.00000000 -0.02263859 -0.13155794  0.07141167 -0.26713987  0.203598429  0.08252259 -0.14443949  0.1837023711  0.2563062065 -0.088011524 -0.19424194
# var9  -0.02263859  1.00000000 -0.11915503  0.01410899  0.15464526 -0.134541757 -0.03644167 -0.02363666  0.0989460178  0.1114800840  0.030616311  0.13378112
# var17 -0.13155794 -0.11915503  1.00000000 -0.10029683  0.17083857  0.227839929  0.06118896  0.05873093  0.0998719164 -0.2568822937 -0.094905918 -0.13075200
# var19  0.07141167  0.01410899 -0.10029683  1.00000000  0.09032814 -0.012338821  0.22697915 -0.07484026 -0.0148609726  0.1932314580 -0.103153852 -0.10459271
# var20 -0.26713987  0.15464526  0.17083857  0.09032814  1.00000000 -0.144317569 -0.03582327  0.14659201 -0.0626693480 -0.1783827493 -0.202125723 -0.03297363
# var24  0.20359843 -0.13454176  0.22783993 -0.01233882 -0.14431757  1.000000000  0.17266009  0.12499469 -0.0762715869  0.0407680053 -0.006395668  0.21197313
# var26  0.08252259 -0.03644167  0.06118896  0.22697915 -0.03582327  0.172660086  1.00000000 -0.12126656 -0.0170290134  0.0443483077 -0.004262910  0.06677876
# var28 -0.14443949 -0.02363666  0.05873093 -0.07484026  0.14659201  0.124994690 -0.12126656  1.00000000  0.1570735783 -0.1143660589 -0.045272571 -0.09274355
# var29  0.18370237  0.09894602  0.09987192 -0.01486097 -0.06266935 -0.076271587 -0.01702901  0.15707358  1.0000000000 -0.0009512571  0.052742960 -0.06363072
# var30  0.25630621  0.11148008 -0.25688229  0.19323146 -0.17838275  0.040768005  0.04434831 -0.11436606 -0.0009512571  1.0000000000  0.195731397  0.04830179
# var32 -0.08801152  0.03061631 -0.09490592 -0.10315385 -0.20212572 -0.006395668 -0.00426291 -0.04527257  0.0527429602  0.1957313969  1.000000000  0.10423393
# var37 -0.19424194  0.13378112 -0.13075200 -0.10459271 -0.03297363  0.211973131  0.06677876 -0.09274355 -0.0636307237  0.0483017908  0.104233926  1.00000000

我必须处理10000 x 10000相关矩阵,并且上述方法非常低。我想知道是否有更快的方法

非常感谢您!

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)