问题描述
我本质上想做的是比较两个数据集:第一个数据集的行数和列数有限,其中每一行都已被验证为正确;第二个数据集会随着时间的推移而改变,我需要确定第二个数据集中的哪些行与第一个数据集不匹配。这好比我有一份答案键(数据帧 1)和一个包含数百万行的数据帧,我需要将这些行与答案键进行比较,以确定哪些行匹配。
我已经阅读了很多解决方案,但还没有找到一个简洁的解决方案 - 有什么建议吗?
"C:\Users\Public\Documents\Embarcadero\Studio\20.0\CatalogRepository\AndroidSDK-2525_20.0.36039.7899\build-tools\28.0.2\dx.bat" --dex --output="C:\PRJ\Delphi\FMX\ImgVwr\Android64\Debug\classes.dex" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\android-support-v4.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\cloud-messaging.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-ads-base.17.2.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-ads-identifier.16.0.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-ads-lite.17.2.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-ads.17.2.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-analytics-impl.16.0.8.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-analytics.16.0.8.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-base.16.0.1.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-basement.16.2.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-gass.17.2.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-identity.16.0.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-maps.16.1.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-measurement-base.16.4.0.dex.jar" "c:\program files 
(x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-measurement-sdk-api.16.4.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-stats.16.0.1.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-tagmanager-v4-impl.16.0.8.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-tasks.16.0.1.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-android-gms.play-services-wallet.16.0.1.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-firebase.firebase-analytics.16.4.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-firebase.firebase-common.16.1.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-firebase.firebase-iid-interop.16.0.1.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-firebase.firebase-iid.17.1.1.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-firebase.firebase-measurement-connector.17.0.1.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\com-google-firebase.firebase-messaging.17.5.0.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\fmx.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\google-play-billing.dex.jar" "c:\program files (x86)\embarcadero\studio\20.0\lib\android\Debug\google-play-licensing.dex.jar"
数据集 2
library(tibble)
# Data frame 1 from the question (described there as the "answer key"),
# written one record per line so each row is easy to read.
df1 <- tribble(
  ~bc, ~var1, ~var2, ~var3,
  "A",   324,   468,   462,
  "B",   223,   362,   328,
  "C",   187,   200,   229,
  "D",   286,   455,   423
)
我想要做的是按变量 bc 将数据集 2 与数据集 1 进行核对。例如,数据集 2 中第二个 bc 为 D 的行与数据集 1 不匹配。
解决方法
没有看到任何数据很难回答你。
使用 which 函数可以告诉您哪些行符合某些条件。 下面是一个如何使用which的例子。 您可以将其更改为 which(df2$answers %in% df1$answer_key) 或类似内容
# Attach the built-in iris data set
data(iris)
# Preview the first rows
head(iris)
# Sepal lengths to search for
which_example <- c(5.4, 4.6)
# which() answers "at which positions is the condition TRUE?" -- here,
# the row numbers where Sepal.Length matches 5.4
which(iris[["Sepal.Length"]] %in% 5.4)
# Same lookup against several candidate values at once
which(iris[["Sepal.Length"]] %in% which_example)
# Row numbers can be fed straight into df[row, column] indexing.
# An empty column slot keeps every column ...
iris[which(iris[["Sepal.Length"]] %in% 5.4), ]
# ... while a column vector keeps only the columns listed
iris[which(iris[["Sepal.Length"]] %in% 5.4), c(2, 4)]
,
您或许可以使用 dplyr 包中的 setdiff 函数。它会显示不相同的行。
library(dplyr)
# Start from an untouched copy of mtcars so the original stays available
# for comparison
mtcars_edit <- mtcars
# Overwrite wt in rows 1, 3, and 5 with 1 so that exactly those three rows
# differ from the original (the recorded output below lists three rows)
mtcars_edit$wt[c(1, 3, 5)] <- 1
# setdiff() returns the rows of the first data frame that do not appear
# in the second one
setdiff(mtcars, mtcars_edit)
#                    mpg cyl disp  hp drat   wt  qsec vs am gear carb
# Mazda RX4         21.0   6  160 110 3.90 2.62 16.46  0  1    4    4
# Datsun 710        22.8   4  108  93 3.85 2.32 18.61  1  1    4    1
# Hornet Sportabout 18.7   8  360 175 3.15 3.44 17.02  0  0    3    2
,
正如 Ronak Shah 在评论中所述,如果我们对您的理解正确,anti_join 可能是这里的解决方案。这里使用了更简单的数据集,以便于复现。为了更容易发现不匹配,示例数据中的不匹配值是负数。
此外,为了识别哪些行不匹配,您可以添加一个 id 变量。
library(tidyverse)
# Reference data: the rows that count as correct
df1 <- tribble(
  ~bc, ~var1, ~var2, ~var3,
  "A",     1,     2,     3,
  "B",     4,     5,     6
)
# Data to check; the two non-matching rows carry a negative value so they
# are easy to spot (values taken from the printed output below)
df2 <- tribble(
  ~bc, ~var1, ~var2, ~var3,
  "A",     1,     2,     3,
  "B",     4,     5,     6,
  "A",     1,     2,     3,
  "B",     4,     5,    -1,
  "A",    -1,     2,     3,
  "B",     4,     5,     6
)
# add id to identify which rows are not matching
df2 <- df2 %>% mutate(id = row_number())
# anti_join() keeps the rows of df2 that have no partner in df1 on the
# listed columns; the outer parentheses print the result as well
(df_unmatch <- dplyr::anti_join(df2, df1, by = c("var1", "var2", "var3")))
#> # A tibble: 2 x 5
#>   bc     var1  var2  var3    id
#>   <chr> <dbl> <dbl> <dbl> <int>
#> 1 B         4     5    -1     4
#> 2 A        -1     2     3     5
# list of non-match are the ids of df_unmatch
df_unmatch$id
#> [1] 4 5
# with that you can create a new variable in df2, for instance
# (the %in% test is already logical, so no if_else() wrapper is needed)
df2 %>% mutate(correct_match = !(id %in% df_unmatch$id))
#> # A tibble: 6 x 6
#>   bc     var1  var2  var3    id correct_match
#>   <chr> <dbl> <dbl> <dbl> <int> <lgl>
#> 1 A         1     2     3     1 TRUE
#> 2 B         4     5     6     2 TRUE
#> 3 A         1     2     3     3 TRUE
#> 4 B         4     5    -1     4 FALSE
#> 5 A        -1     2     3     5 FALSE
#> 6 B         4     5     6     6 TRUE
由 reprex package (v2.0.0) 于 2021 年 6 月 12 日创建