
条件:ML100k.data
注意:程序对列表进行修改,为避免列表索引出现问题,避免使用for i in range(len(data)),而使用for i in data可避免这一问题
- import pickle
- data = []
- with open("ML100k.data", 'r') as file:
- for line in file:
- data.append([int(item) for item in line.strip('\n').split('\t')])
-
- user_dict = {}
- film_dict = {}
- for i in data:
- user = i[0]
- film = i[1]
-
- if user not in user_dict:
- user_dict[user] = [i]
- else:
- user_dict[user].append(i)
- if film not in film_dict:
- film_dict[film] = [i]
- else:
- film_dict[film].append(i)
- # 删掉不活跃用户、冷门电影
- for user in user_dict:
- if len(user_dict[user]) < 5:
- for _ in data:
- if _ in user_dict[user]:
- data.remove(_)
- for film in film_dict:
- if len(film_dict[film]) < 5:
- for _ in data:
- if _ in film_dict[film]:
- data.remove(_)
- # 统计用户数量user_num、电影数量item_num、评分数量rating_num
- user_sum = {}
- item_sum = {}
- for i in data:
- user = i[0]
- item = i[1]
- if user not in user_sum:
- user_sum[user] = [i]
- else:
- user_sum[user].append(i)
- if item not in item_sum:
- item_sum[item] = [i]
- else:
- item_sum[item].append(i)
- print(len(user_sum))
- print(len(item_sum))
- print(len(data))
-
- # 计算稀疏度
- sparsity = len(data)/(len(user_sum)*len(item_sum))
- print(sparsity)
-
- # 统计每个用户的平均评分user_average、每部电影的平均评分item_average、以及全部评分的平均评分global_average.
- user_average = []
- item_average = []
- sorted_user = list(user_sum.keys())
- sorted_item = list(item_sum.keys())
- sorted_item.sort()
- sorted_user.sort()
- for user in sorted_user:
- user_average.append(sum(user_sum[user][2])/len(user_sum[user]))
- for item in sorted_item:
- item_average.append(sum(item_sum[item][2])/len(item_sum[item]))
- # print(user_average)
- # print(item_average)
-
- # 统计所有评分中1~5的分布情况rating_num
- rating_num = [0, 0, 0, 0, 0]
- for i in data:
- rating = i[2]
- rating_num[rating-1] += 1
- print(rating_num)
-
- # 将用户和电影分别从0开始标号,使得用户的最大编号为user_sum-1,电影最大编号为item_sum-1
- user_num = {}
- item_num = {}
- count1, count2 = 0, 0
- for i in data:
- user = i[0]
- item = i[1]
- if user not in user_num:
- user_num[user] = count1
- count1 += 1
- if item not in item_num:
- item_num[item] = count2
- count2 += 1
- i[0] = user_num[user]
- i[1] = item_num[item]