Problem description
I am trying to implement a faster Apriori algorithm using the transaction-reduction technique from the paper, and I found that pandas crosstab is handy for counting frequent itemsets. I am looking for suggestions to improve the code: in particular, to make the number of itemsets generic, so that it stays scalable when there are hundreds of itemsets.
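To make the intent concrete, here is a minimal sketch of the pattern the script relies on, shown on a tiny made-up transaction matrix (the data and names below are illustrative only and are not part of the actual script):
import pandas as pd
# Hypothetical one-hot transaction matrix: rows = transactions, columns = items
# (the real script builds this with CountVectorizer further down).
demo = pd.DataFrame([[1, 1, 0, 1],
                     [1, 0, 1, 1],
                     [0, 1, 0, 0]],
                    index=['T1', 'T2', 'T3'],
                    columns=['i1', 'i2', 'i3', 'i4'])
min_sup_demo = 2
# Support pruning: keep only items whose column sum reaches the minimum support (drops i3).
frequent = demo.loc[:, demo.sum(axis=0) >= min_sup_demo]
# Transaction reduction: a transaction with fewer than 2 frequent items cannot
# contain any frequent 2-itemset, so it is dropped from the next pass (drops T3).
reduced = frequent[frequent.sum(axis=1) >= 2]
print(reduced)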
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from itertools import chain
from collections import OrderedDict
from itertools import combinations
import time
start_time = time.time()
'''
corpus = ['I1,I2,I5', 'I2,I4', 'I3', 'I1,I3,I3']
'''
corpus = ['I1,I3,I4', 'I1,I2,I4']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# print(vectorizer.get_feature_names())
index_col=['T'+str(i+1) for i in range(len(corpus))]
# print(index_col)
# print(X.toarray())
columns=vectorizer.get_feature_names()
C={}
L={}
candidates=None
min_sup=2
min_conf=30.0
def getMinSupport(my_table, min_sup, candidates_temp):
    if candidates_temp is None:
        df_L = my_table.loc[:, (my_table.sum(axis=0) >= min_sup)]  # Keep columns whose sum (support count) is at least min_sup
        df_L = df_L[df_L.sum(axis=1) >= min_sup]  # Keep rows (transactions) whose sum is at least min_sup
        return df_L
    else:
        rows_singleton = my_table.index.to_list()  # Rows that met minimum support
        cols_singleton = L[1].columns.to_list()  # Singleton columns that met minimum support
        # print('rows_singleton ', rows_singleton)
        # print('cols_singleton ', cols_singleton)
        new_df = L[1][cols_singleton].copy()
        new_df = new_df[new_df.index.isin(my_table.index)]
        # print('new_df')
        # print(new_df)
        for itemset in candidates_temp:
            combination_string = ",".join(itemset)
            itemset_as_list = []
            for item in itemset:
                itemset_as_list.append(item)
            # print(itemset_as_list)
            new_df[combination_string] = new_df[itemset_as_list].all(axis=1)  # Candidate present iff every item in it is present
        # print('Before dropping singleton columns')
        # print(new_df)
        new_df.drop(cols_singleton, axis=1, inplace=True)
        # print('After dropping singleton columns')
        # print(new_df)
        df_L = new_df.loc[:, (new_df.sum(axis=0) >= min_sup)]  # Keep columns whose sum (support count) is at least min_sup
        # print('Sum of columns is greater than minimum support')
        # print(df_L)
        if len(df_L.columns) == 1:
            return df_L
        df_L = df_L[df_L.sum(axis=1) >= min_sup]  # Keep rows (transactions) whose sum is at least min_sup
        # print('Sum of rows is greater than minimum support')
        # print(df_L)
        return df_L
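# Note on getMinSupport: the column-sum filter keeps only itemsets whose support
# count reaches min_sup, while the row-sum filter drops transactions that no longer
# hold enough frequent items to contribute to larger itemsets -- this row pruning
# appears to be the transaction-reduction step mentioned in the question.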
# Python 3 helper to remove duplicate
# tuples from a list of tuples
def removeDuplicates(lst):
    return [t for t in (set(tuple(i) for i in lst))]
def getCandidateSets(arr, tuple_size, iteration):
    temp_itemtuple_list = []
    temp = list(combinations(arr, 2))
    for item in temp:
        mylist = (item[0] + ',' + item[1]).split(',')
        t = sorted(list(dict.fromkeys(mylist)))  # Order the item list lexicographically
        if len(t) == iteration + 1:
            valid_tuple = t
            temp_itemtuple_list.append(valid_tuple)
    temp_itemtuple_list = removeDuplicates(temp_itemtuple_list)  # Remove duplicate tuples from the list
    return temp_itemtuple_list
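# Illustrative example (hypothetical values, not output from this script's run):
# getCandidateSets(['i1', 'i2', 'i4'], 2, 1) pairs up the frequent 1-itemsets and
# yields the candidate 2-itemsets ('i1', 'i2'), ('i1', 'i4') and ('i2', 'i4')
# (returned as tuples in arbitrary order, since duplicates are removed via a set).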
k = 1
C[k] = pd.DataFrame(data=X.toarray(), index=index_col, columns=columns)
# print(C[k])
L[k] = getMinSupport(C[k], min_sup, candidates)
# print(L[k])
while (len(L[k].index) != 0):
    candidates = getCandidateSets(L[k].columns.to_list(), 2, k)
    # print(candidates)
    k = k + 1
    L[k] = getMinSupport(L[k - 1], min_sup, candidates)
    # print(L[k])
print('Terminated generation of itemsets')
#print(L[k-1])
"""**Rule generation**
Get proper subsets
"""
def powerset(arr):
    itemset4rules_dict = {}
    tuple_size = len(arr)
    for i in range(1, tuple_size + 1):
        itemset4rules_dict[i] = list(combinations(arr, i))
    return itemset4rules_dict
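# Illustrative example (hypothetical itemset): powerset(['i1', 'i2', 'i4']) returns
# {1: [('i1',), ('i2',), ('i4',)],
#  2: [('i1', 'i2'), ('i1', 'i4'), ('i2', 'i4')],
#  3: [('i1', 'i2', 'i4')]},
# i.e. every non-empty subset grouped by size, which drives the rule generation below.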
L[k-1].columns.to_list()
for col in L[k - 1].columns.to_list():
    lst = col.split(',')
    itemset4rules_dict = powerset(lst)
"""Find support for each itemset"""
def getSupport(list_itemsets, itemset_size):
    # itemset_size is k
    s = L[itemset_size][[c for c in L[itemset_size].columns if c in list_itemsets]].reset_index().melt('index')
    temp = pd.crosstab(index=s['index'], columns=s.variable, values=s.value, aggfunc='sum', margins=True)
    return temp.iloc[-1][:-1].to_list()
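# Note on getSupport: with margins=True, pd.crosstab appends an 'All' row holding the
# column totals, so temp.iloc[-1][:-1] reads off the support count of every selected
# itemset in one go (the trailing 'All' grand-total entry is sliced away).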
support_dictionary = {}
for key, value in itemset4rules_dict.items():
    # print(key, ' : ', value)
    # print('Itemset count :', key)
    list_itemsets = []
    for itemset in value:
        my_string = ','.join(str(x) for x in itemset)
        # print(my_string)
        list_itemsets.append(my_string)
    # print(list_itemsets)
    list_supports = getSupport(list_itemsets, key)
    # support_dictionary = dict(zip(list_itemsets, list_supports))
    support_dictionary.update(dict(zip(list_itemsets, list_supports)))
# print(support_dictionary)
"""Find the items with required confidence"""
# Confidence = support({I1,I2,I3}) / support({I1,I2})
# {1,3} -> ({1,3,5} - {1,3}) means 1 & 3 -> 5
# {1}   -> ({1,3,5} - {1})   means 1 -> 3 & 5
def printRules(valid_rules):
    for lhs in valid_rules:
        rhs = list(set(superset.split(',')) - set(lhs.split(',')))
        if len(rhs) == 0:
            rhs = superset.split(',')
        rhs = " & ".join(rhs)
        print(lhs + ' ==> ' + rhs)
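# Illustrative example (hypothetical values): if superset is 'i1,i2,i4' and the
# antecedent lhs is 'i1', the consequent is {i1,i2,i4} - {i1} = {i2,i4} and the rule
# prints as "i1 ==> i2 & i4" (consequent order may vary since a set is used); when
# lhs equals the whole superset, the superset itself is printed on the right-hand side.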
superset = list(support_dictionary.keys())[-1]
superset_support = support_dictionary[list(support_dictionary.keys())[-1]]  # Last item in dictionary is the superset
valid_rules = []
# print('Itemset', 'Confidence')
for key, value in support_dictionary.items():
    subset_support = value
    confidence = (superset_support / subset_support) * 100.0
    if confidence >= min_conf:
        # print(key, confidence)
        valid_rules.append(key)
print("Rules generated with min_sup = %s and min_conf = %s" % (min_sup, min_conf))
printRules(valid_rules)
print("--- %s milli seconds ---" % (time.time() - start_time))