python 电影分析
# -*- coding: utf-8 -*-
"""
Apriori exercise.
Created on Sun Oct 26 11:09:03 2017
@author: FWW
"""
import time
# ----222---
def createC1(dataSet):
'''
构建初始候选项集的列表,即所有候选项集只包含一个元素,
C1是大小为1的所有候选项集的集合
'''
C1 = []
for transaction in dataSet:
for item in transaction:
if [item] not in C1:
C1.append([item])
C1.sort()
return list(map(frozenset, C1))
# ----333---
def scanD(D, Ck, minSupport):
'''
计算Ck中的项集在事务集合D的每个transactions中的支持度,
返回满足最小支持度的项集的集合,和所有项集支持度信息的字典。
'''
ssCnt = {}
for tid in D:
# 对于每一条transaction
for can in Ck:
# 对于每一个候选项集can,检查是否是transaction的一部分
# 即该候选can是否得到transaction的支持
if can.issubset(tid):
ssCnt[can] = ssCnt.get(can, 0) + 1
numItems = float(len(D))
retList = []
supportData = {}
for key in ssCnt:
# 每个项集的支持度
support = ssCnt[key] / numItems
# 将满足最小支持度的项集,加入retList
if support >= minSupport:
retList.insert(0, key)
# 汇总支持度数据
supportData[key] = support
return retList, supportData
# 444
# Aprior算法
def aprioriGen(Lk, k):
'''
由初始候选项集的集合Lk生成新的生成候选项集,
k表示生成的新项集中所含有的元素个数
'''
retList = []
lenLk = len(Lk)
for i in range(lenLk):
for j in range(i + 1, lenLk):
L1 = list(Lk[i])[: k - 2];
L2 = list(Lk[j])[: k - 2];
L1.sort();
L2.sort()
if L1 == L2:
retList.append(Lk[i] | Lk[j])
return retList
# ------------------------1111----
def apriori(dataSet, minSupport=0.5):
# 构建初始候选项集C1
C1 = createC1(dataSet)
# 将dataSet集合化,以满足scanD的格式要求
D = list(map(set, dataSet))
# 构建初始的频繁项集,即所有项集只有一个元素
L1, suppData = scanD(D, C1, minSupport)
L = [L1]
# 最初的L1中的每个项集含有一个元素,新生成的
# 项集应该含有2个元素,所以 k=2
k = 2
while (len(L[k - 2]) > 0):
Ck = aprioriGen(L[k - 2], k)
Lk, supK = scanD(D, Ck, minSupport)
# 将新的项集的支持度数据加入原来的总支持度字典中
suppData.update(supK)
# 将符合最小支持度要求的项集加入L
L.append(Lk)
# 新生成的项集中的元素个数应不断增加
k += 1
# 返回所有满足条件的频繁项集的列表,和所有候选项集的支持度信息
return L, suppData
# 777
def calcConf(freqSet, H, supportData, brl, minConf=0.5):
'''
计算规则的可信度,返回满足最小可信度的规则。
freqSet(frozenset):频繁项集
H(frozenset):频繁项集中所有的元素
supportData(dic):频繁项集中所有元素的支持度
brl(tuple):满足可信度条件的关联规则
minConf(float):最小可信度
'''
prunedH = []
for conseq in H:
conf = supportData[freqSet] / supportData[freqSet - conseq]
if conf >= minConf:
# print (freqSet - conseq, '-->', conseq, 'conf:', conf)
brl.append((freqSet - conseq, conseq, conf))
prunedH.append(conseq)
return prunedH
# 666
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.5):
'''
对频繁项集中元素超过2的项集进行合并。
freqSet(frozenset):频繁项集
H(frozenset):频繁项集中的所有元素,即可以出现在规则右部的元素
supportData(dict):所有项集的支持度信息
brl(tuple):生成的规则
'''
m = len(H[0])
if m == 1:
calcConf(freqSet, H, supportData, brl, minConf)
# 查看频繁项集是否大到移除大小为 m 的子集
if len(freqSet) > m + 1:
Hmp1 = aprioriGen(H, m + 1)
Hmp1 = calcConf(freqSet, Hmp1, supportData, brl, minConf)
# 如果不止一条规则满足要求,进一步递归合并
if len(Hmp1) > 1:
rulesFromConseq(freqSet, Hmp1, supportData, brl, minConf)
def recommendMovies(rules, personal_list, movie_list):
recommend_list = []
sup_list = []
for rule in rules:
if rule[0] <= personal_list:
for movie in rule[1]:
if movie_list[movie - 1] not in recommend_list:
recommend_list.append(movie_list[movie - 1])
sup_list.append(rule[2])
for recommend in recommend_list:
i = recommend_list.index(recommend)
print('Recommend you to watch', recommend, ',', round(sup_list[i] * 100, 2),
'% people who is similar to you like it!')
def generateRules(L, supportData, minConf=0.5):
'''
根据频繁项集和最小可信度生成规则。
L(list):存储频繁项集
supportData(dict):存储着所有项集(不仅仅是频繁项集)的支持度
minConf(float):最小可信度
'''
bigRuleList = []
for i in range(1, len(L)):
for freqSet in L[i]:
# 对于每一个频繁项集的集合freqSet
H1 = [frozenset([item]) for item in freqSet]
# 如果频繁项集中的元素个数大于2,需要进一步合并
if i > 1:
rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
else:
calcConf(freqSet, H1, supportData, bigRuleList, minConf)
return bigRuleList
def convert_to_int(value):
try:
return int(value)
except (ValueError, TypeError):
return 0 # 或者根据需求返回其他默认值
if __name__ == '__main__':
# 导入数据集
start_time = time.time()
file_object = open('ratings.dat')
movies_object = open('movies.dat')
personal_object = open('personalRatings.txt')
file_list = []
all_the_text = file_object.read()
origin_list = (line.split('::') for line in all_the_text.split('\n'))
tem_list = []
for line in origin_list:
# print(line[0])
if line[0] != '':
if len(file_list) < convert_to_int(line[0]):
file_list.append(tem_list)
tem_list = []
else:
# print("ta")
# print(line[0])
if int(line[2]) > 3:
tem_list.append(int(line[1]))
with open('movies.dat', 'r', encoding='latin1', errors='ignore') as movies_object:
movies_text = movies_object.read()
movies_list = []
for item in (line.split('::') for line in movies_text.split('\n')):
# print(item) #空处理
if item and len(item)>2: # 检查是否为空行以及包含足够的元素
# print(item) # 处理非空的情况
# print(len(item))
if item[1] not in movies_list:
movies_list.append(item[1])
personal_text = personal_object.read()
personal_list = []
for item in (line.split('::') for line in personal_text.split('\n')):
if item and len(item) > 2: # 检查是否为空行以及包含足够的元素
if int(item[2]) > 3:
personal_list.append(int(item[1]))
file_object.close()
movies_object.close()
personal_object.close()
print('Read file sucess in', time.time() - start_time, 's')
# 选择频繁项集
L, suppData = apriori(file_list, 0.2)
rules = generateRules(L, suppData, minConf=0.5)
# print ('rules:\n', rules)
print('Caculate rules success in', time.time() - start_time, 's')
recommendMovies(rules, frozenset(personal_list), movies_list)
print('The program completes in', time.time() - start_time, 's')