就是频繁项集挖掘,FP-Growth算法。
先产生headerTable:
数据结构(其实也是调了好几次代码才确定的,因为一开始总有想不到的东西):entry: entry: {authorName: frequence,firstChildPointer,startYear,endYear}
def CreateHeaderTable(tranDB,minSupport=1): headerTable={} #entry: entry: {authorName: frequence,endYear} authorDB={} #entry: {frozenset(authorListSet): frequence} for i,(conf,year,authorList) in enumerate(tranDB): authorListSet=set([]) print "trans",i,"=="*20 if conf is np.nan or year is np.nan or authorList is np.nan: continue #for tranDB[2426,:] for author in authorList.split("|"): authorListSet.add(author) if headerTable.has_key(author): headerTable[author][0]+=1 if year<headerTable[author][2]: headerTable[author][2]=year elif year>headerTable[author][3]: headerTable[author][3]=year else: headerTable[author]=[1,None,year] if authorDB.has_key(frozenset(authorListSet)): authorDB[frozenset(authorListSet)]+=1 else: authorDB[frozenset(authorListSet)]=1 for author in headerTable.keys(): if headerTable[author][0]<minSupport: del headerTable[author] return headerTable,authorDB
再构建FP-Tree:
每个treeNode又五元组来描述:
class TREE_NODE: def __init__(self,authorName,frequence,parentPointer): self.authorName=authorName self.frequence=frequence self.parentPointer=parentPointer #parent TREE_NODE self.childrenPointer={} #children TREE_NODEs dict self.brotherPointer=None #brother TREE_NODE注意,每次加入节点,这个五元组的每一项是否都考虑了;
如果是新节点,是否考虑链接到headerTable了;
对,只有这两点;代码如下:
''' headerTable={} #entry: {authorName: frequence,endYear} if we want to call UpdateCondTree(),then headerTable==>condHeaderTable={} #entry: {authorName: frequence,firstChildPointer} #TREE_NODE: (self,parentPointer,childrenPointer={},brotherPointer=None) ''' def UpdateTree(authorsList,treeNode,headerTable,frequence): if treeNode.childrenPointer.has_key(authorsList[0]): treeNode.childrenPointer[authorsList[0]].frequence+=frequence #isn't +1 else: #add authorsList[0] as a new child of treeNode treeNode.childrenPointer[authorsList[0]]=TREE_NODE(authorsList[0],treeNode) if headerTable[authorsList[0]][1]==None: #[1] is the firstChildPointer headerTable[authorsList[0]][1]=treeNode.childrenPointer[authorsList[0]] else: firstChildPointer=headerTable[authorsList[0]][1] tempAuthorNode=firstChildPointer while tempAuthorNode.brotherPointer is not None: tempAuthorNode=tempAuthorNode.brotherPointer tempAuthorNode.brotherPointer=treeNode.childrenPointer[authorsList[0]] if len(authorsList)>1: #recursively call UpdateTree() with authorsList[1:] #UpdateTree(authorsList[1:],frequence) UpdateTree(authorsList[1:],treeNode.childrenPointer[authorsList[0]],frequence) #do care this! ''' headerTable={} #entry: {authorName: frequence,endYear} if we want to call CreateCondTree(),firstChildPointer} authorDB={} #entry: {frozenset(authorListSet): frequence} #TREE_NODE: (self,brotherPointer=None) ''' def CreateTree(authorDB,headerTable): #same function as CreateCondTree() treeRoot=TREE_NODE("NULL",None)#root Node of FPtree for i,(authorListSet,frequence) in enumerate(authorDB.items()): print "authorListSet","=="*20 tempDict={} for author in authorListSet: if headerTable.has_key(author): tempDict[author]=headerTable[author][0] if len(tempDict)>0: tempList=sorted(tempDict.items(),key=lambda x:x[1],reverse=True) authorsList=[author for author,count in tempList] UpdateTree(authorsList,treeRoot,frequence) return treeRoot
注意,每一步验证一下:
#secondly,create the FP-Tree(the second pass) treeRoot=CreateTree(authorDB,headerTable) headerTable["Ying Wu"][1].authorName #Ying Wu headerTable["Ying Wu"][1].frequence #47,so 4=51-47 in other brotherPointer TREE_NODE print len(treeRoot.childrenPointer) #4028 < 7318=len(headerTable) print treeRoot.childrenPointer[treeRoot.childrenPointer.keys()[0]].authorName #Linli Xu print treeRoot.childrenPointer[treeRoot.childrenPointer.keys()[0]].frequence #2 < 12=headerTable["Linli Xu"][0],so 10=12-2 in other brotherPointer TREE_NODE print treeRoot.childrenPointer[treeRoot.childrenPointer.keys()[0]].parentPointer.authorName #NULL,that's the root!
挖掘FP-Tree,找出频繁项:
思路:从支持度最小的单项集出发,依次递归挖掘支持度更大的单项集。
对每一个单项集的挖掘,思路如下:
找headerTable的每一个孩子结点,递归寻找这个孩子结点的条件子树;
然后合并每个孩子结点找到的条件子树,构成一颗关于该单项集的条件子树,递归挖掘该单项集的条件子树,从而形成两个、三个、四个、、、项集的频繁模式;
对所有生成的这些两个、三个、四个、、、项集的频繁模式,递归上面的过程即可。
def FindParentTreeNodes(baseTreeNode): parentTreeNodes=[] while baseTreeNode.parentPointer is not None: #while baseTreeNode is not the ROOT node whose parentPointer is None and authorName is "NULL" parentTreeNodes.append(baseTreeNode.authorName) baseTreeNode=baseTreeNode.parentPointer return parentTreeNodes def FindCondAuthorDB(firstChildPointerTreeNode): condAuthorDB={} #entry: {frozenset(authorListSet): frequence} tempTreeNode=firstChildPointerTreeNode while tempTreeNode is not None: parentTreeNodes=FindParentTreeNodes(tempTreeNode) if len(parentTreeNodes)>1: condAuthorDB[frozenset(parentTreeNodes[1:])]=tempTreeNode.frequence #parentTreeNodes[1:],remove self treeNode tempTreeNode=tempTreeNode.brotherPointer return condAuthorDB def CreateCondHeaderTable(condAuthorDB,minSupport=1): condHeaderTable={} #entry: {authorName: frequence,firstChildPointer} for i,frequence) in enumerate(condAuthorDB.items()): print "cond trans","=="*20 for author in authorListSet: if condHeaderTable.has_key(author): headerTable[author][0]+=frequence else: condHeaderTable[author]=[frequence,None] for author in condHeaderTable.keys(): if condHeaderTable[author][0]<minSupport: del condHeaderTable[author] return condHeaderTable ''' headerTable={} #entry: {authorName: frequence,endYear} if we want to call mineCondTree(),brotherPointer=None) ''' def mineTree(treeRoot,minSupport=1,baseFreqAuthorSet=set([]),finalFreqAuthorPattDict={}): sortedAuthorsList=[value[0] for value in sorted(headerTable.items(),reverse=False)] for baseAuthor in sortedAuthorsList: #start from bottom of headerTable newFreqAuthorSet=baseFreqAuthorSet.copy() newFreqAuthorSet.add(baseAuthor) finalFreqAuthorPattDict[frozenset(newFreqAuthorSet)]=headerTable[baseAuthor][0] condAuthorDB=FindCondAuthorDB(headerTable[baseAuthor][1]) #[1] is the firstChildPointer condHeaderTable=CreateCondHeaderTable(condAuthorDB,minSupport) condTreeRoot=CreateTree(condAuthorDB,condHeaderTable) #call CreateCondTree() if condHeaderTable is not None: mineTree(condTreeRoot,condHeaderTable,minSupport,newFreqAuthorSet,finalFreqAuthorPattDict) #call mineCondTree()