将文本列转换为R中的向量

问题描述

我想检查文本列中是否含有指定值 "a" 和 "b" 之外的元素，并把这些多余的元素提取出来。

# Values that are allowed to appear in the `text` column.
specified_value <- c("a", "b")

# Example input: `text` holds comma-separated tokens, one string per key.
df <- data.frame(
  key = c(1, 2, 3, 4),
  text = c("a,b,c", "a,d", "1,2", "a,b")
)
# Desired result: for each key, the tokens of `text` that are not in
# `specified_value`; NA when no such token exists.
df_out <- data.frame(
  key = c(1, 2, 3, 4),
  text = c("c", "d", "1,2", NA)
)

这是我尝试过的：

# Asker's attempt, kept exactly as posted (`%>%` and `mutate` require dplyr).
# NOTE(review): `strsplit()` makes `text_vector` a list column, so `%in%` is
# applied to the list as a whole rather than inside each split vector; that is
# presumably why individual tokens are never filtered out -- confirm.
df=df%>%mutate(text_vector=strsplit(text,split=","),extra=text_vector[which(!text_vector %in% specified_value)])

但这并没有得到想要的结果，有什么建议吗？

解决方法

我们可以用 separate_rows 按定界符 `,` 拆分 'text' 列，按 'key' 分组，用 setdiff 取得不在 'specified_value' 中的元素，并用 toString（相当于 paste）把它们连接在一起，然后与原始数据集进行联接（join）以取回其他列。

使用setdiff。

# For every row, split `text` on commas and keep only the tokens that are not
# listed in `specified_value`.
extras <- lapply(strsplit(df$text, ",", fixed = TRUE), setdiff, specified_value)
# Re-join the leftover tokens. Rows with nothing left get a real missing value:
# passing NA through paste() would yield the string "NA" instead.
df$outside <- vapply(
  extras,
  function(tokens) {
    if (length(tokens) == 0L) {
      NA_character_
    } else {
      paste(tokens, collapse = ",")
    }
  },
  character(1)
)
df
#   key  text outside
# 1   1 a,b,c       c
# 2   2   a,d       d
# 3   3   1,2     1,2
# 4   4   a,b    <NA>

数据：

# Example data in dput() form: `text` holds comma-separated tokens per key.
df <- structure(
  list(
    key = c(1, 2, 3, 4),
    text = c("a,b,c", "a,d", "1,2", "a,b")
  ),
  class = "data.frame",
  row.names = c(NA, -4L)
)

# Values that are allowed to appear in `text`.
specified_value <- c("a", "b")

使用 stringi::stri_split_fixed

library(stringi)
# Split a string on the fixed delimiter "," and report whether any resulting
# token falls outside `specified_value`.
!all(stri_split_fixed("a,b", ",", simplify = TRUE) %in% specified_value) # FALSE
!all(stri_split_fixed("a,b,c", ",", simplify = TRUE) %in% specified_value) # TRUE

使用正则表达式而不用逗号分割数据的选项：

# Collapse specified_value into one alternation pattern and strip it from text.
# NOTE(review): this removes every occurrence of the listed values, including
# inside longer tokens, so it only suits single-character tokens like these.
df$text1 <- gsub(paste0(specified_value, collapse = "|"), "", df$text)
# Remove the commas left behind, i.e. those not preceded by a remaining token
# character (the lookbehind requires perl = TRUE).
df$text1 <- gsub("(?<![a-z0-9]),", "", df$text1, perl = TRUE)
df
#   key  text text1
# 1   1 a,b,c     c
# 2   2   a,d     d
# 3   3   1,2   1,2
# 4   4   a,b

r r string string strsplit vector vector