• 动手学深度学习(五)Kaggle房价预测


    Kaggle房价数据集:训练集每行包含 Id、79 个房屋特征和 1 个标签(房价 SalePrice),测试集不含标签。下文打印样本时只展示前四列和最后三列。

    一、下载数据集

    1. import numpy as np
    2. import pandas as pd
    3. import torch
    4. from torch import nn
    5. from d2l import torch as d2l
    6. import hashlib
    7. import os
    8. import tarfile
    9. import zipfile
    10. import requests
    # Dataset download configuration.
    # DATA_HUB maps a dataset name to a (url, sha1_hash) tuple.
    DATA_HUB = {}
    # Base URL that the hosted dataset file names are appended to.
    DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
    def download(name, cache_dir=os.path.join('.', 'data')):  # @save
        """Download a file registered in DATA_HUB and return its local path.

        A file already present in ``cache_dir`` whose SHA-1 digest matches the
        registered hash is reused without touching the network.

        Args:
            name: Key into DATA_HUB; its value is unpacked as (url, sha1_hash).
            cache_dir: Directory the file is stored in; created if missing.

        Returns:
            Path of the local file inside ``cache_dir``.

        Raises:
            AssertionError: If ``name`` is not a key of DATA_HUB.
            requests.HTTPError: If the server answers with an error status.
        """
        assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
        url, sha1_hash = DATA_HUB[name]
        os.makedirs(cache_dir, exist_ok=True)
        fname = os.path.join(cache_dir, url.split('/')[-1])
        if os.path.exists(fname):
            sha1 = hashlib.sha1()
            with open(fname, 'rb') as f:
                # Hash in 1 MiB chunks so the file is never fully in memory.
                for chunk in iter(lambda: f.read(1048576), b''):
                    sha1.update(chunk)
            if sha1.hexdigest() == sha1_hash:
                return fname  # cache hit
        print(f'正在从{url}下载{fname}...')
        with requests.get(url, stream=True, verify=True) as r:
            # Fail loudly instead of saving an HTTP error page as the dataset.
            r.raise_for_status()
            with open(fname, 'wb') as f:
                # Write chunk by chunk so stream=True actually bounds memory.
                for chunk in r.iter_content(chunk_size=1048576):
                    f.write(chunk)
        return fname
    def download_extract(name, folder=None):  # @save
        """Download a zip/tar archive and extract it beside the downloaded file.

        Args:
            name: Dataset name passed on to ``download``.
            folder: Optional sub-directory name; when given, the returned path
                is that folder inside the download directory.

        Returns:
            ``<download dir>/<folder>`` if ``folder`` is given, otherwise the
            archive path with its last extension removed.

        Raises:
            AssertionError: If the file extension is not .zip, .tar or .gz.
        """
        fname = download(name)
        base_dir = os.path.dirname(fname)
        data_dir, ext = os.path.splitext(fname)
        if ext == '.zip':
            open_archive = zipfile.ZipFile
        elif ext in ('.tar', '.gz'):
            open_archive = tarfile.open
        else:
            # Explicit raise: a bare `assert False` vanishes under `python -O`.
            raise AssertionError('只有zip/tar文件可以被解压缩')
        # The context manager closes the archive handle after extraction.
        # NOTE(review): extractall trusts member paths; only use on trusted archives.
        with open_archive(fname, 'r') as fp:
            fp.extractall(base_dir)
        return os.path.join(base_dir, folder) if folder else data_dir
    def download_all():  # @save
        """Download every file registered in DATA_HUB."""
        for name in DATA_HUB:
            download(name)
    # Register the Kaggle house-price CSV files as (url, sha1_hash) pairs.
    DATA_HUB['kaggle_house_train'] = (
        DATA_URL + 'kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce')
    DATA_HUB['kaggle_house_test'] = (
        DATA_URL + 'kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')

    # Download (or reuse the cached copy of) both files and load them as tables.
    train_data = pd.read_csv(download('kaggle_house_train'))
    test_data = pd.read_csv(download('kaggle_house_test'))

    查看数据集大小和部分样本:

    # Dataset sizes as (rows, columns).
    print(train_data.shape)
    print(test_data.shape)
    # First four rows, restricted to the first four and the last three columns.
    print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

    (1460, 81)
    (1459, 80)


       Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
    0   1          60       RL         65.0       WD        Normal     208500
    1   2          20       RL         80.0       WD        Normal     181500
    2   3          60       RL         68.0       WD        Normal     223500
    3   4          70       RL         60.0       WD       Abnorml     140000

    二、数据预处理

    # Data preprocessing
    # Stack train and test features. The first column (Id) is dropped from
    # both tables, and the last column of train_data (the label) as well.
    all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
    # Select numeric columns by kind rather than by "not object", so string
    # columns are excluded whatever dtype pandas infers for them.
    numeric_features = all_features.select_dtypes(include='number').columns
    # Standardize every numeric column to zero mean and unit variance.
    all_features[numeric_features] = all_features[numeric_features].apply(
        lambda x: (x - x.mean()) / (x.std()))
    # After standardization the column mean is 0, so filling missing values
    # with 0 is the same as filling them with the mean.
    all_features[numeric_features] = all_features[numeric_features].fillna(0)
    # One-hot encode the categorical columns; dummy_na=True adds an indicator
    # column for missing values. dtype=float keeps the table purely numeric:
    # recent pandas returns bool dummies by default, which would turn
    # `.values` into an object array that torch.tensor cannot convert.
    all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
    # Split back into train/test by row position and convert to tensors.
    n_train = train_data.shape[0]
    train_features = torch.tensor(all_features[:n_train].values,
                                  dtype=torch.float32)
    test_features = torch.tensor(all_features[n_train:].values,
                                 dtype=torch.float32)
    # Labels as a column vector of shape (n_train, 1).
    train_labels = torch.tensor(
        train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)

     查看特征总数大小:

    # Print the (rows, columns) shape of the preprocessed feature table.
    print(all_features.shape)

    (2919, 331) 

    可以看到,经过数据预处理后特征总数由79增加到331:独热编码把每个离散特征按取值(以及缺失值指示)展开成了多列。

    三、训练函数

    房价就像股票价格一样,我们关心的是相对误差,而不是绝对误差。例如,同样是10万的预测误差,对于原价12.5万的农村房屋来说偏差极大,而对于市中心原价420万的豪宅来说却相当准确。我们希望评价指标不受房价量级的影响,使昂贵房屋和廉价房屋的预测误差对结果有同等的影响,因此先对预测值和标签取对数,再计算均方根误差,即对数均方根误差(log RMSE):$\sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\log y_i-\log \hat{y}_i\right)^2}$。注意:训练时优化的仍是普通的均方误差(MSE),log RMSE 只用于评估。

    # Training
    # Mean squared error is the loss minimized during training.
    loss = nn.MSELoss()
    # Number of input features = number of columns of the feature matrix
    # (331 for the preprocessed table printed above).
    in_features = train_features.shape[1]
    def get_net():
        """Build the model: one linear layer from in_features inputs to 1 output."""
        return nn.Sequential(nn.Linear(in_features, 1))
    def log_rmse(net, features, labels):
        """Return the RMSE between log-predictions and log-labels.

        Args:
            net: Model called as ``net(features)``.
            features: Input tensor fed to ``net``.
            labels: Target tensor; its logarithm is taken, so values must be
                positive.

        Returns:
            The log RMSE as a Python float.
        """
        # Pure evaluation: skip autograd bookkeeping for this forward pass.
        with torch.no_grad():
            # Clamp predictions to at least 1 so the logarithm stays stable.
            clipped_preds = torch.clamp(net(features), 1, float('inf'))
            rmse = torch.sqrt(loss(torch.log(clipped_preds),
                                   torch.log(labels)))
        return rmse.item()

    对数均方根误差(log RMSE)函数(上面的代码中已包含 log_rmse,这里单独列出)

    # Log root-mean-squared error.
    # NOTE: this re-defines the same-named log_rmse that appears earlier in
    # the file; the later definition is the one in effect.
    def log_rmse(net, features, labels):
        """Return the RMSE between log-predictions and log-labels.

        Args:
            net: Model called as ``net(features)``.
            features: Input tensor fed to ``net``.
            labels: Target tensor; its logarithm is taken, so values must be
                positive.

        Returns:
            The log RMSE as a Python float.
        """
        # Pure evaluation: skip autograd bookkeeping for this forward pass.
        with torch.no_grad():
            # Clamp predictions to at least 1 so the logarithm stays stable.
            clipped_preds = torch.clamp(net(features), 1, float('inf'))
            rmse = torch.sqrt(loss(torch.log(clipped_preds),
                                   torch.log(labels)))
        return rmse.item()

    训练函数

    训练函数使用Adam优化器。

    # Training function
    def train(net, train_features, train_labels, test_features, test_labels,
              num_epochs, learning_rate, weight_decay, batch_size):
        """Train ``net`` with Adam on the MSE loss, logging log RMSE per epoch.

        Args:
            net: Model to optimize in place.
            train_features, train_labels: Training tensors.
            test_features, test_labels: Evaluation tensors; evaluation is
                skipped when ``test_labels`` is None.
            num_epochs: Number of passes over the training data.
            learning_rate: Adam learning rate.
            weight_decay: Adam weight-decay coefficient.
            batch_size: Minibatch size for the training iterator.

        Returns:
            ``(train_ls, test_ls)``: per-epoch log RMSE lists; ``test_ls`` is
            empty when ``test_labels`` is None.
        """
        train_ls, test_ls = [], []
        # Minibatch iterator over the training data.
        train_iter = d2l.load_array((train_features, train_labels), batch_size)
        optimizer = torch.optim.Adam(net.parameters(),
                                     lr=learning_rate,
                                     weight_decay=weight_decay)
        for _ in range(num_epochs):
            for X, y in train_iter:
                optimizer.zero_grad()
                batch_loss = loss(net(X), y)
                batch_loss.backward()
                optimizer.step()
            # Record metrics once per epoch, after all minibatch updates.
            train_ls.append(log_rmse(net, train_features, train_labels))
            if test_labels is not None:
                test_ls.append(log_rmse(net, test_features, test_labels))
        return train_ls, test_ls

     

    四、K折交叉验证(可选,炼丹步骤)

    def get_k_fold_data(k, i, X, y):
        """Split ``(X, y)`` into k contiguous folds; fold ``i`` is for validation.

        Folds have ``X.shape[0] // k`` rows each, except the last one, which
        also takes the remainder rows so that no sample is silently dropped.

        Args:
            k: Number of folds, must be greater than 1.
            i: Index of the validation fold, ``0 <= i < k``.
            X: Feature tensor, indexed as ``X[rows, :]``.
            y: Label tensor with the same number of rows as ``X``.

        Returns:
            ``(X_train, y_train, X_valid, y_valid)``.

        Raises:
            AssertionError: If ``k <= 1``.
            ValueError: If ``i`` is not a valid fold index.
        """
        assert k > 1
        if not 0 <= i < k:
            raise ValueError(f'fold index i must be in [0, {k}), got {i}')
        n = X.shape[0]
        fold_size = n // k
        X_valid, y_valid = None, None
        X_parts, y_parts = [], []
        for j in range(k):
            # The last fold runs to the end to absorb the n % k leftover rows.
            stop = n if j == k - 1 else (j + 1) * fold_size
            idx = slice(j * fold_size, stop)
            X_part, y_part = X[idx, :], y[idx]
            if j == i:
                X_valid, y_valid = X_part, y_part
            else:
                X_parts.append(X_part)
                y_parts.append(y_part)
        # One concatenation at the end instead of growing a tensor per fold.
        X_train = torch.cat(X_parts, 0)
        y_train = torch.cat(y_parts, 0)
        return X_train, y_train, X_valid, y_valid

     当我们在K折交叉验证中训练K次后,返回训练和验证误差的平均值

    def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
               batch_size):
        """Run k-fold cross-validation and return the average final errors.

        For each fold a fresh model from ``get_net`` is trained; the log RMSE
        of the last epoch is accumulated for training and validation data.
        The learning curves of the first fold are plotted.

        Returns:
            ``(mean_train_log_rmse, mean_valid_log_rmse)`` over the k folds.
        """
        train_l_sum, valid_l_sum = 0, 0
        for i in range(k):
            data = get_k_fold_data(k, i, X_train, y_train)
            net = get_net()
            train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                       weight_decay, batch_size)
            # Only the last-epoch error of each fold enters the average.
            train_l_sum += train_ls[-1]
            valid_l_sum += valid_ls[-1]
            if i == 0:
                # The curves hold log RMSE values, so label the axis to match.
                d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                         xlabel='epoch', ylabel='log rmse',
                         xlim=[1, num_epochs],
                         legend=['train', 'valid'], yscale='log')
            print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f}, '
                  f'验证log rmse{float(valid_ls[-1]):f}')
        return train_l_sum / k, valid_l_sum / k

    五、模型选择(可选,炼丹步骤)

    不断地更换超参数,保留最优的超参数。

    # Hyperparameters for 5-fold cross-validation.
    k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
    train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                              weight_decay, batch_size)
    print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
          f'平均验证log rmse: {float(valid_l):f}')

    六、训练

    # Training and prediction
    def train_and_pred(train_features, test_features, train_labels, test_data,
                       num_epochs, lr, weight_decay, batch_size):
        """Train on all training data, predict the test set, write submission.csv.

        Args:
            train_features, train_labels: Full training tensors.
            test_features: Test tensor fed to the trained model.
            test_data: Test DataFrame with an ``Id`` column; a ``SalePrice``
                column holding the predictions is added to it in place.
            num_epochs, lr, weight_decay, batch_size: Passed on to ``train``.
        """
        net = get_net()
        train_ls, _ = train(net, train_features, train_labels, None, None,
                            num_epochs, lr, weight_decay, batch_size)
        d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
                 ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
        print(f'训练log rmse:{float(train_ls[-1]):f}')
        # Apply the trained network to the test set.
        preds = net(test_features).detach().numpy()
        # Assign the flat array positionally: wrapping it in a Series would
        # align on the index and yield NaN for a non-default test_data index.
        test_data['SalePrice'] = preds.reshape(-1)
        # Export the Id / SalePrice pair in the format Kaggle expects.
        submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
        submission.to_csv('submission.csv', index=False)
    # Final run with the chosen hyperparameters.
    k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
    train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size)

  • 相关阅读:
    注解配置SpringMVC
    这个开学季,注定不平凡
    Linux红帽(RHCE)认证学习笔记
    【OpenCV 例程 300篇】248. 特征描述之HOG描述符
    ComfyUI生成视频时,K采样器就一直报错
    java源码系列:HashMap源码验证,自己手写一个HashMap之02-写put方法以及思路、哈希冲突等
    Kali Linux源
    Botowski:SEO友好的AI内容生成器
    Python list列表删除元素(4种方法)
    Java (JVM) 内存模型
  • 原文地址:https://blog.csdn.net/qq_45981086/article/details/132651657