如何在 sparklyr 包中运行 FPGrowth


我有一个名为 “li” 的数据(一个由字符向量组成的列表),想对它运行 FPGrowth 算法,但不知道该怎么做。


# Make fake data: a list of 10 random "transactions", each holding between
# 5 and 20 lower-case letters.
# Letters are drawn with replacement, so make.unique() renames repeats
# ("a", "a.1", ...) to keep the items inside one transaction distinct
# (Spark's FP-growth rejects transactions that contain duplicate items).
li <- lapply(seq_len(10), function(i) {
  n_items <- sample(5:20, 1)
  make.unique(letters[sample(1:26, n_items, replace = TRUE)])
})

sc <- spark_connect(master = "local",version = "3.0.1")

df <- copy_to(sc,**....??????what should be here??????...**  )
            fp_growth_model <- ml_fpgrowth(df)

这里有一个类似问题的答案,但它对我不起作用,我收到了以下错误:

sc <- spark_connect(master = "local",version = "2.3")
tb <- tibble::tibble(items=c("a b c","a b","c f g","b c"))

df <- copy_to(sc,tb)  %>% 
 mutate(items = split(items,"\\\\s+"))

Error in mutate(.,items = split(items,"\\\\s+")) : 
  Could not find function "mutate"

/// plyr::mutate

df <- copy_to(sc,tb)  %>% 
  plyr::mutate(items = split(items,"\\\\s+"))

Error in sdf_import.default(x,sc,name,memory,repartition,overwrite,: 
  table tb already exists (pass overwrite = TRUE to overwrite)

/// SparkR::mutate

df <- copy_to(sc,tb)  %>% 
  SparkR::mutate(items = split(items,"\\\\s+"))

Error in sdf_import.default(x,sc,name,memory,repartition,overwrite,: 
  table tb already exists (pass overwrite = TRUE to overwrite)


上面提到的那个答案中的代码示例是可以运行的。您收到第一个错误,是因为提供 mutate 的包(dplyr)没有加载;收到第二个错误,是因为对象 tb 已经被复制到 Spark 中(再次复制时需要传入 overwrite = TRUE)。



# Attach sparklyr (spark_connect, copy_to, ml_fpgrowth) and dplyr (mutate,
# %>%). Without dplyr attached the pipeline below stops with
# 'could not find function "mutate"'.
library(sparklyr)
library(dplyr)

sc <- spark_connect(master = "local")

# One transaction per row; the items of a transaction are separated by spaces.
tb <- tibble::tibble(items = c("a b c", "a b", "c f g", "b c"))

# overwrite = TRUE makes the example re-runnable in the same Spark session;
# without it a second copy_to() fails with "table tb already exists".
# Inside mutate() on a Spark table, split() is not R's base::split(): it is
# passed through to Spark SQL's split(str, regex), which turns each string
# into an array of items -- the column format ml_fpgrowth() works on.
df <- copy_to(sc, tb, overwrite = TRUE) %>%
  mutate(items = split(items, "\\\\s+"))

# Fit the FP-growth model on the default "items" column.
fp_growth_model <- ml_fpgrowth(df)


要对数据集 li 执行 FP-growth,您需要先更改它的格式。函数 ml_fpgrowth 需要一个 Spark DataFrame,其中有一列是列表类型,每一行的列表包含一条序列(即一笔交易中的各个项)。您不能把带有列表列的 R data frame 直接传输到 Spark。因此,首先创建一个把每条序列保存为单个字符串的 Spark DataFrame,然后使用 mutate 和 split 函数把字符串拆分成列表。


> tb_li <- tibble(items=sapply(li,function(x) paste(x,collapse=" ")))
> tb_li
# A tibble: 10 x 1
   items
   <chr>
 1 o s n c j r v k e t n.1 v.1 y z e.1 s.1 y.1 y.2 i
 2 c h z g j i s d n q k g.1 u l o j.1 m            
 3 i i.1 j w u g u.1 f y b e                        
 4 l m r a y y.1 f u o i o.1 z                      
 5 p t f k h v v.1 g p.1 q v.2 r q.1 b d m          
 6 v s y t v.1 y.1 n y.2 w                          
 7 h p l y n c n.1                                  
 8 g c w v z o u e h s j r j.1 l b j.2 v.1          
 9 l t n q n.1 v c h n.2 s o x q.1 w k g o.1 w.1 z  
10 n g j e f p x u w k                              

将数据传输到 Spark 并生成列表:

> df_li <- copy_to(sc,tb_li,overwrite = TRUE) %>%
+   mutate(items = split(items,"\\\\s+"))
> df_li
# Source: spark<?> [?? x 1]
   items
   <list>
 1 <list [19]>
 2 <list [17]>
 3 <list [11]>
 4 <list [12]>
 5 <list [16]>
 6 <list [9]> 
 7 <list [7]> 
 8 <list [17]>
 9 <list [19]>
10 <list [10]>


> fp_growth_model_li <- ml_fpgrowth(df_li)

> ml_association_rules(fp_growth_model_li)
# Source: spark<?> [?? x 4]
   antecedent consequent confidence  lift
   <list>     <list>          <dbl> <dbl>
 1 <list [4]> <list [1]>          1     2
 2 <list [3]> <list [1]>          1     2
 3 <list [3]> <list [1]>          1     2
 4 <list [3]> <list [1]>          1     2
 5 <list [5]> <list [1]>          1     2
 6 <list [5]> <list [1]>          1     2
 7 <list [3]> <list [1]>          1     2
 8 <list [3]> <list [1]>          1     2
 9 <list [3]> <list [1]>          1     2
10 <list [3]> <list [1]>          1     2
# ... with more rows

> ml_freq_itemsets(fp_growth_model_li)
# Source: spark<?> [?? x 2]
   items       freq
   <list>     <dbl>
 1 <list [1]>     3
 2 <list [2]>     3
 3 <list [3]>     3
 4 <list [2]>     3
 5 <list [1]>     5
 6 <list [2]>     3
 7 <list [3]>     3
 8 <list [3]>     3
 9 <list [4]>     3
10 <list [2]>     4
# ... with more rows


