• python 综合练习


    条件:ML100k.data

    注意:在遍历列表的同时删除其中的元素,无论写成 for i in range(len(data)) 还是 for i in data,都会出问题:前者可能索引越界,后者会跳过被删除元素之后紧邻的那个元素。正确做法是遍历列表的副本,或用列表推导式构造一个过滤后的新列表。

    import pickle  # not used by the visible part of the script; kept as-is


    def load_ratings(path):
        """Read a tab-separated file of integers into a list of records.

        Each non-blank line becomes one list of ints. The rest of the script
        reads column 0 as the user id, column 1 as the film id and column 2
        as the rating.
        """
        records = []
        with open(path, 'r') as file:
            for line in file:
                line = line.strip('\n')
                if not line.strip():
                    # A blank (e.g. trailing) line would make int('') raise.
                    continue
                records.append([int(item) for item in line.split('\t')])
        return records


    def group_by_column(records, column):
        """Return a dict mapping each value of ``column`` to its records.

        The lists hold the very same record objects as ``records`` (no
        copies), in their original order.
        """
        groups = {}
        for record in records:
            groups.setdefault(record[column], []).append(record)
        return groups


    def drop_sparse(records, user_groups, film_groups, min_ratings=5):
        """Return the records whose user and film both have enough ratings.

        ``user_groups`` / ``film_groups`` are the groupings of the unfiltered
        data, so both thresholds are judged against the original counts.
        A new list is built instead of calling ``list.remove`` while
        iterating the same list, which silently skips the element that
        follows each removed one.
        """
        return [
            record for record in records
            if len(user_groups[record[0]]) >= min_ratings
            and len(film_groups[record[1]]) >= min_ratings
        ]


    def compute_sparsity(rating_count, user_count, item_count):
        """Return rating_count / (user_count * item_count), or 0.0 if empty."""
        cells = user_count * item_count
        if cells == 0:
            return 0.0
        return rating_count / cells


    def average_ratings(groups):
        """Return the mean rating of every group, ordered by ascending key.

        The rating is column 2 of each record, so the mean is taken over
        that column of all records in the group (not over the fields of a
        single record).
        """
        return [
            sum(record[2] for record in groups[key]) / len(groups[key])
            for key in sorted(groups)
        ]


    def rating_histogram(records, levels=5):
        """Count how many records carry each rating 1..levels.

        Returns a list where index ``r - 1`` holds the count of rating ``r``.
        Raises ValueError for a rating outside 1..levels; otherwise a rating
        of 0 would be silently counted in the last slot via negative indexing.
        """
        counts = [0] * levels
        for record in records:
            rating = record[2]
            if not 1 <= rating <= levels:
                raise ValueError("rating out of range: %r" % (rating,))
            counts[rating - 1] += 1
        return counts


    def reindex(records):
        """Renumber users and films from 0 in order of first appearance.

        Columns 0 and 1 of every record are overwritten in place. Returns
        ``(user_map, item_map)``, each mapping an original id to its new
        index, so the largest new index is ``len(map) - 1``.
        """
        user_map = {}
        item_map = {}
        for record in records:
            user = record[0]
            item = record[1]
            if user not in user_map:
                user_map[user] = len(user_map)
            if item not in item_map:
                item_map[item] = len(item_map)
            record[0] = user_map[user]
            record[1] = item_map[item]
        return user_map, item_map


    if __name__ == "__main__":
        data = load_ratings("ML100k.data")
        user_dict = group_by_column(data, 0)
        film_dict = group_by_column(data, 1)

        # Drop inactive users and unpopular films (fewer than 5 ratings).
        data = drop_sparse(data, user_dict, film_dict, 5)

        # Count the remaining users, films and ratings.
        user_sum = group_by_column(data, 0)
        item_sum = group_by_column(data, 1)
        print(len(user_sum))
        print(len(item_sum))
        print(len(data))

        # Sparsity of the rating matrix.
        sparsity = compute_sparsity(len(data), len(user_sum), len(item_sum))
        print(sparsity)

        # Average rating per user, per film, and over all ratings.
        # user_average / item_average follow the order of sorted_user /
        # sorted_item respectively.
        sorted_user = sorted(user_sum)
        sorted_item = sorted(item_sum)
        user_average = average_ratings(user_sum)
        item_average = average_ratings(item_sum)
        global_average = (
            sum(record[2] for record in data) / len(data) if data else 0.0
        )

        # Distribution of the ratings 1..5.
        rating_num = rating_histogram(data)
        print(rating_num)

        # Renumber users and films from 0 so the largest user index is
        # len(user_sum) - 1 and the largest film index is len(item_sum) - 1.
        user_num, item_num = reindex(data)

     

  • 相关阅读:
    C语言基本算法之选择排序
    mysql返回值concat函数拼接,if函数,CASE WHEN函数条件判断。
    Day46 动态规划 part08
    【DaVinci Developer工具实战】03 -导入xml文件
    新华三与中国移动完成IPv6随流检测互通测试
    SuperBuilder的用法,此时不要用Builder
    鸿鹄工程项目管理系统 Spring Cloud+Spring Boot+Mybatis+Vue+ElementUI+前后端分离构建工程项目管理系统
    git 常用命令
    设计模式:命令模式(C++实现)
    springboot遇到的错误
  • 原文地址:https://blog.csdn.net/Rhett_Butler0922/article/details/132811902