数据挖掘-实验-FP-growth

  1. 1. 关联规则之FP-growth算法
    1. 1.1. 建立FP树的类定义
    2. 1.2. FP树构建函数
    3. 1.3. 制造一些数据
    4. 1.4. 构造成 element : count 的形式,如 z:5, x:3
    5. 1.5. 抽取条件模式基需要用到的函数
    6. 1.6. 创建条件FP树,挖掘树中的信息
    7. 1.7. 跑起来
    8. 1.8. 说明一下

关联规则之FP-growth算法

建立FP树的类定义

复制下面7个步骤的代码到一个python文件中

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
class treeNode:
def __init__(self, nameValue, numOccur, parentNode):
self.name = nameValue # 存放节点名字
self.count = numOccur # 存放计数值
self.nodeLink = None # 链接相似的元素项
self.parent = parentNode # 指向父节点
self.children = {} # 存放子节点

def inc(self, numOccur):
self.count += numOccur

def disp(self, ind=1):
print(' '*ind, self.name, ' ', self.count)
for child in self.children.values():
child.disp(ind+1)

FP树构建函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def createFPTree(dataSet, minSup=1):
headerTable = {}
for trans in dataSet:
for item in trans:
headerTable[item] = headerTable.get(item, 0) + dataSet[trans]
for k in list(headerTable.keys()):
if headerTable[k] < minSup:
del(headerTable[k]) # 删除不满足最小支持度的元素
freqItemSet = set(headerTable.keys()) # 满足最小支持度的频繁项集
if len(freqItemSet) == 0:
return None, None
for k in headerTable:
headerTable[k] = [headerTable[k], None] # element: [count, node]

retTree = treeNode('Null Set', 1, None)
for tranSet, count in dataSet.items():
# dataSet:[element, count]
localD = {}
for item in tranSet:
if item in freqItemSet: # 过滤,只取该样本中满足最小支持度的频繁项
localD[item] = headerTable[item][0] # element : count
if len(localD) > 0:
# 根据全局频数从大到小对单样本排序
orderedItem = [v[0] for v in sorted(localD.items(), key=lambda p:p[1], reverse=True)]
# 用过滤且排序后的样本更新树
updateFPtree(orderedItem, retTree, headerTable, count)
return retTree, headerTable

def updateHeader(nodeToTest, targetNode):
while nodeToTest.nodeLink != None:
nodeToTest = nodeToTest.nodeLink
nodeToTest.nodeLink = targetNode
def updateFPtree(items, inTree, headerTable, count):
if items[0] in inTree.children:
# 判断items的第一个结点是否已作为子结点
inTree.children[items[0]].inc(count)
else:
# 创建新的分支
inTree.children[items[0]] = treeNode(items[0], count, inTree)
# 更新相应频繁项集的链表,往后添加
if headerTable[items[0]][1] == None:
headerTable[items[0]][1] = inTree.children[items[0]]
else:
updateHeader(headerTable[items[0]][1], inTree.children[items[0]])
# 递归
if len(items) > 1:
updateFPtree(items[1::], inTree.children[items[0]], headerTable, count)

制造一些数据

1
2
3
4
5
6
7
8
def loadSimpDat():
simpDat = [['r', 'z', 'h', 'j', 'p'],
['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
['z'],
['r', 'x', 'n', 'o', 's'],
['y', 'r', 'x', 'z', 'q', 't', 'p'],
['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
return simpDat

构造成 element : count 的形式,如 z:5, x:3

1
2
3
4
5
6
7
8
9
def createInitSet(dataSet):
retDict={}
for trans in dataSet:
key = frozenset(trans)
if key in retDict:
retDict[frozenset(trans)] += 1
else:
retDict[frozenset(trans)] = 1
return retDict

抽取条件模式基需要用到的函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# 递归回溯树,从本节点一直往上
def ascendTree(leafNode,prefixPath):
if leafNode.parent != None:
prefixPath.append(leafNode.name)
ascendTree(leafNode.parent,prefixPath)

# 找到需查找元素的所有前缀路径
def findPrefixPath(basePat,treeNode):
condPats={}
while treeNode != None:
prefixPath=[]
ascendTree(treeNode,prefixPath)
if len(prefixPath)>1:
condPats[frozenset(prefixPath[1:])]=treeNode.count
treeNode=treeNode.nodeLink
return condPats

创建条件FP树,挖掘树中的信息

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def mineTree(inTree,headerTable,minSup,preFix,freqItemList):
# 此处原文有错误
# 从头指针的底端开始
bigL=[v[0] for v in sorted(headerTable.items(),key=lambda p:p[0])]
for basePat in bigL:
newFreqSet=preFix.copy()
newFreqSet.add(basePat)
freqItemList.append(newFreqSet)
# 创建条件基
condPattBases=findPrefixPath(basePat,headerTable[basePat][1])
# 创造条件fp树
myCondTree,myHead=createFPTree(condPattBases,minSup)
# 递归找到满足阈值的项
# myHead==None 时,说明树只有一层叶节点,即频繁次项为一元
if myHead!=None:
print('conditional tree for:',newFreqSet)
myCondTree.disp(1)
mineTree(myCondTree,myHead,minSup,newFreqSet,freqItemList)

跑起来

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
print("构建FP树:")
min_sup = 3 # 最小支持度3
simDat = loadSimpDat()
initSet = createInitSet(simDat)
myFPtree, myHeaderTab = createFPTree(initSet, min_sup)
myFPtree.disp()

print("================")
print("抽取条件模式基:")
for item in list(myHeaderTab.keys()):
print(findPrefixPath(item,myHeaderTab[item][1]))

print("================")
print("挖掘出频繁项集:")
freqItems=[]
c=mineTree(myFPtree,myHeaderTab,3,set([]),freqItems)
print(freqItems)

说明一下

对于这颗构建的FP树

它其实是这个意思

要反应的是下面这样的树结构

打印出来的条件模式基

其实反应的就是说的这个表

最后,这里打印出来的是频繁项集