# 第2关:动手实现Apriori算法 (Level 2: implement the Apriori algorithm by hand)
def createC1(dataset):
    """Build the candidate 1-itemsets from a collection of transactions.

    :param dataset: iterable of transactions, each an iterable of hashable items
    :return: set holding one single-item ``frozenset`` per distinct item
    """
    # frozenset rather than set: members of a set must themselves be hashable.
    return {
        frozenset([item])
        for transaction in dataset
        for item in transaction
    }
def scanD(D, ck, minsupport):
    """Count candidate itemsets over the transactions and filter by support.

    :param D: sized collection of transactions, each supporting subset tests
    :param ck: re-iterable collection of candidate itemsets (frozensets)
    :param minsupport: minimum fraction of transactions a candidate must
        appear in to be kept
    :return: tuple ``(frequent, supports)``; ``frequent`` lists the candidates
        whose support is at least ``minsupport`` (in reverse order of first
        occurrence), ``supports`` maps every candidate that occurred at least
        once to its support
    """
    occurrences = {}
    for transaction in D:
        for candidate in ck:
            if candidate.issubset(transaction):
                occurrences[candidate] = occurrences.get(candidate, 0) + 1

    total = len(D)
    supports = {cand: hits / total for cand, hits in occurrences.items()}

    # Reversing afterwards yields the same ordering as inserting each
    # qualifying candidate at the front of the list.
    frequent = [cand for cand, value in supports.items() if value >= minsupport]
    frequent.reverse()
    return frequent, supports
def aprioriGen(Lk, k):
    """Join (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first ``k - 2`` items, taken in sorted
    order, are identical. Comparing sorted prefixes makes the join independent
    of frozenset iteration order and produces each candidate at most once
    when ``Lk`` holds distinct itemsets.

    :param Lk: list of frozensets, each of size ``k - 1``; for ``k >= 3`` the
        items inside one itemset must be mutually orderable
    :param k: size of the itemsets to generate
    :return: list of candidate frozensets of size ``k``
    """
    # Compute every prefix once instead of once per pair.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    candidates = []
    count = len(Lk)
    for i in range(count):
        for j in range(i + 1, count):
            if prefixes[i] == prefixes[j]:
                candidates.append(Lk[i] | Lk[j])
    return candidates
def apriori(dataSet, minSupport):
    """Find all frequent itemsets in ``dataSet`` with the Apriori algorithm.

    :param dataSet: collection of transactions, each an iterable of items
    :param minSupport: minimum support an itemset needs to count as frequent
    :return: tuple ``(L, supportData)``; ``L[0]`` holds the frequent
        1-itemsets and every later entry is obtained from candidates
        generated out of the previous entry (the final entry is always an
        empty list), ``supportData`` merges the support dictionaries of all
        passes
    """
    single_item_candidates = createC1(dataSet)
    transactions = [set(transaction) for transaction in dataSet]
    frequent, supportData = scanD(transactions, single_item_candidates, minSupport)

    levels = [frequent]
    size = 2
    # The newest level is always the last one, so stop once it comes back empty.
    while levels[-1]:
        candidates = aprioriGen(levels[-1], size)
        frequent, level_support = scanD(transactions, candidates, minSupport)
        supportData.update(level_support)
        levels.append(frequent)
        size += 1
    return levels, supportData
# 第3关:从频繁项集中挖掘关联规则 (Level 3: mine association rules from the frequent itemsets)
from utils import apriori, aprioriGen
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
    """Score the rules ``freqSet - conseq -> conseq`` for each consequent in ``H``.

    :param freqSet: frozenset the rules are derived from
    :param H: iterable of candidate consequents (frozensets)
    :param supportData: mapping from itemset to support; must contain
        ``freqSet`` and every ``freqSet - conseq``
    :param brl: list that receives ``(antecedent, consequent, confidence)``
        tuples for rules meeting ``minConf``; mutated in place
    :param minConf: minimum confidence a rule needs to be kept
    :return: list of the consequents whose rule met ``minConf``
    """
    kept = []
    for consequent in H:
        antecedent = freqSet - consequent
        confidence = supportData[freqSet] / supportData[antecedent]
        if confidence >= minConf:
            brl.append((antecedent, consequent, confidence))
            kept.append(consequent)
    return kept
def ruleFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Recursively grow rule consequents for ``freqSet`` and score them.

    The consequents in ``H`` are merged into consequents one item larger,
    the resulting rules are scored with ``calcConf``, and the survivors are
    merged again as long as more than one remains. The consequents in ``H``
    themselves are not scored here.

    :param freqSet: frozenset the rules are derived from
    :param H: list of equally sized consequents (frozensets)
    :param supportData: mapping from itemset to support
    :param brl: list collecting ``(antecedent, consequent, confidence)``
        tuples; mutated in place
    :param minConf: minimum confidence a rule needs to be kept
    :return: None
    """
    if not H:
        # Nothing to merge; also avoids an IndexError on H[0].
        return
    m = len(H[0])
    # A larger consequent only makes sense while the antecedent stays non-empty.
    if len(freqSet) > m + 1:
        merged = aprioriGen(H, m + 1)
        merged = calcConf(freqSet, merged, supportData, brl, minConf)
        # At least two surviving consequents are needed for another merge.
        if len(merged) > 1:
            ruleFromConseq(freqSet, merged, supportData, brl, minConf)
def generateRules(dataset, minsupport, minConf):
    """Generate association rules from the frequent itemsets of ``dataset``.

    The frequent itemsets and their supports are obtained from ``apriori``.

    :param dataset: the data set, a list of transactions
    :param minsupport: minimum support, a float
    :param minConf: minimum confidence, a float
    :return: list of association rules, each an
        ``(antecedent, consequent, confidence)`` tuple
    """
    L, supportData = apriori(dataset, minsupport)
    digRuleList = []
    for level, itemsets in enumerate(L):
        if level == 0:
            # The first level holds single items, which cannot form a rule.
            continue
        for freqSet in itemsets:
            singletons = [frozenset([item]) for item in freqSet]
            if level == 1:
                calcConf(freqSet, singletons, supportData, digRuleList, minConf)
            else:
                # NOTE(review): this path only scores consequents built by
                # merging the singletons, so single-item consequents are not
                # evaluated for these larger itemsets - confirm this is intended.
                ruleFromConseq(freqSet, singletons, supportData, digRuleList, minConf)
    return digRuleList
# 第4关:超市购物清单关联规则分析 (Level 4: association-rule analysis of supermarket shopping lists)
from utils import generateRules
import pandas as pd
# Product name -> integer code. Built once at import time because T() is
# called once per row and the table never changes.
_GOOD_CODES = {
    'yogurt': 1,
    'pork': 2,
    'sandwich bags': 3,
    'lunch meat': 4,
    'all- purpose': 5,
    'flour': 6,
    'soda': 7,
    'butter': 8,
    'vegetables': 9,
    'beef': 10,
    'aluminum foil': 11,
    'dinner rolls': 12,
    'shampoo': 13,
    'mixes': 14,
    'soap': 15,
    'laundry detergent': 16,
    'ice cream': 17,
    'toilet paper': 18,
    'hand soap': 19,
    'waffles': 20,
    'cheeses': 21,
    'milk': 22,
    'dishwashing liquid/detergent': 23,
    'individual meals': 24,
    'cereals': 25,
    'tortillas': 26,
    'spaghetti sauce': 27,
    'ketchup': 28,
    'sandwich loaves': 29,
    'poultry': 30,
    'bagels': 31,
    'eggs': 32,
    'juice': 33,
    'pasta': 34,
    'paper towels': 35,
    'coffee/tea': 36,
    'fruits': 37,
    'sugar': 38,
}


def T(x):
    """Translate a product name into its integer code.

    :param x: product name; must match a key of the code table exactly
    :return: integer code between 1 and 38
    :raises KeyError: if ``x`` is not a known product name
    """
    return _GOOD_CODES[x]
def aprior_data(data):
    """Group the goods of a transaction table into one basket per id.

    Rows are paired by position, so the result does not depend on the index
    labels of ``data``. Baskets appear in order of each id's first occurrence
    and keep the row order of their goods.

    :param data: table with an ``'id'`` column and a ``'good'`` column of
        equal length (e.g. a pandas DataFrame); ids must be hashable
    :return: list of baskets, each a list of goods
    """
    baskets = {}
    # One pass over the rows; dicts preserve insertion order, which gives the
    # first-occurrence ordering of the ids.
    for transaction_id, good in zip(data['id'], data['good']):
        baskets.setdefault(transaction_id, []).append(good)
    return list(baskets.values())
def genRules(data_path, min_support, min_conf):
    """Read a shopping-list CSV and mine association rules from it.

    The ``'good'`` column is converted with ``T``, the rows are grouped into
    baskets with ``aprior_data``, and the baskets are passed to
    ``generateRules``.

    :param data_path: path of the CSV file, passed to ``pandas.read_csv``
    :param min_support: minimum support forwarded to ``generateRules``
    :param min_conf: minimum confidence forwarded to ``generateRules``
    :return: whatever ``generateRules`` returns for the baskets
    """
    frame = pd.read_csv(data_path)
    frame['good'] = frame['good'].apply(T)
    baskets = aprior_data(frame)
    return generateRules(baskets, min_support, min_conf)
|