• 第100+15步 ChatGPT学习:R实现AdaBoost分类


    基于R 4.2.2版本演示

    一、写在前面

    有不少大佬问做机器学习分类能不能用R语言,不想学Python咯。

    答曰:可!用GPT或者Kimi转一下就得了呗。

    加上最近也没啥内容写了,就帮各位搬运一下吧。

    二、R代码实现AdaBoost分类

    (1)导入数据

    我习惯用RStudio自带的导入功能:

    (2)建立AdaBoost模型(默认参数)

    # Load necessary libraries
    library(caret)
    library(pROC)
    library(ggplot2)

    # Assume 'data' is your data frame; column X holds the binary class label.
    # Set seed to ensure reproducibility
    set.seed(123)

    # Convert the target to a factor BEFORE partitioning. createDataPartition()
    # samples within classes only for factor input (numeric input is grouped
    # by percentiles instead), and converting once guarantees that both
    # subsets share the same factor levels.
    target <- as.factor(data$X)
    stopifnot(nlevels(target) == 2L)  # everything below assumes two classes

    # Split data into training and validation sets (80% training, 20% validation)
    trainIndex <- createDataPartition(target, p = 0.8, list = FALSE)
    trainData <- data[trainIndex, ]
    validData <- data[-trainIndex, ]
    trainData$X <- target[trainIndex[, 1]]
    validData$X <- target[-trainIndex[, 1]]

    # 10-fold cross-validation. The object is called cvControl so that it does
    # not shadow caret's trainControl() function.
    cvControl <- trainControl(method = "cv", number = 10)

    # Fit the AdaBoost model (caret method "ada") on the training set
    model <- train(
      X ~ .,
      data = trainData,
      method = "ada",
      trControl = cvControl
    )

    # Print the best parameters found by the model
    best_params <- model$bestTune
    cat("The best parameters found are:\n")
    print(best_params)

    # Predicted probability of the second factor level (the "positive" class)
    trainPredict <- predict(model, trainData, type = "prob")[, 2]
    validPredict <- predict(model, validData, type = "prob")[, 2]

    # ROC curves and AUC values. levels/direction are pinned so that a higher
    # probability always means "second level"; with the automatic direction a
    # worse-than-random model would be silently flipped to an AUC above 0.5.
    classLevels <- levels(target)
    trainRoc <- roc(
      response = trainData$X, predictor = trainPredict,
      levels = classLevels, direction = "<", quiet = TRUE
    )
    validRoc <- roc(
      response = validData$X, predictor = validPredict,
      levels = classLevels, direction = "<", quiet = TRUE
    )

    # Build one ROC plot.
    #   roc_obj  pROC roc object
    #   title    plot title
    #   colour   colour of the curve, shaded area and AUC label
    #   label    data set name used in the AUC annotation
    #   label_y  vertical position of the AUC annotation
    # Returns the ggplot object (auto-printed when called at top level).
    plot_roc_curve <- function(roc_obj, title, colour, label, label_y) {
      curve <- data.frame(
        fpr = 1 - roc_obj$specificities,
        tpr = roc_obj$sensitivities
      )
      # Order from (0, 0) to (1, 1) and draw with geom_path(): geom_line()
      # re-sorts by x only, which zig-zags on the vertical steps of the curve.
      curve <- curve[order(curve$fpr, curve$tpr), ]
      auc_label <- paste(label, "AUC =", round(auc(roc_obj), 2))

      ggplot(data = curve, aes(x = fpr, y = tpr)) +
        geom_path(color = colour) +
        geom_area(alpha = 0.2, fill = colour) +
        geom_abline(
          slope = 1, intercept = 0, linetype = "dashed", color = "black"
        ) +
        ggtitle(title) +
        xlab("False Positive Rate") +
        ylab("True Positive Rate") +
        annotate(
          "text", x = 0.5, y = label_y, label = auc_label,
          hjust = 0.5, color = colour
        )
    }

    # Plot ROC curves with AUC values
    plot_roc_curve(trainRoc, "Training ROC Curve", "blue", "Training", 0.1)
    plot_roc_curve(validRoc, "Validation ROC Curve", "red", "Validation", 0.2)

    # Confusion matrices based on a 0.5 probability cutoff. The prediction is
    # wrapped in a factor with both levels so the table is always 2 x 2, even
    # when every observation falls on the same side of the cutoff.
    classify_at_cutoff <- function(prob, cutoff = 0.5) {
      factor(prob >= cutoff, levels = c(FALSE, TRUE))
    }
    confMatTrain <- table(trainData$X, classify_at_cutoff(trainPredict))
    confMatValid <- table(validData$X, classify_at_cutoff(validPredict))
    # Draw a confusion matrix as a ggplot2 heat map and print it.
    #   conf_mat      two-way table (first dimension = actual class,
    #                 second dimension = predicted class)
    #   dataset_name  label inserted into the plot title
    plot_confusion_matrix <- function(conf_mat, dataset_name) {
      # Long format: one row per (actual, predicted) cell with its count
      cells <- setNames(
        as.data.frame(as.table(conf_mat)),
        c("Actual", "Predicted", "Freq")
      )
      plot_title <- paste("Confusion Matrix -", dataset_name, "Set")

      heatmap <- ggplot(
        data = cells,
        aes(x = Predicted, y = Actual, fill = Freq)
      )
      heatmap <- heatmap + geom_tile(color = "white")
      heatmap <- heatmap +
        geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5)
      heatmap <- heatmap +
        scale_fill_gradient(low = "white", high = "steelblue")
      heatmap <- heatmap +
        labs(title = plot_title, x = "Predicted Class", y = "Actual Class")
      heatmap <- heatmap +
        theme_minimal() +
        theme(
          axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(hjust = 0.5)
        )

      print(heatmap)
    }
    # Plot and display the confusion matrices
    plot_confusion_matrix(confMatTrain, "Training")
    plot_confusion_matrix(confMatValid, "Validation")

    # Compute classification metrics from a 2 x 2 confusion matrix whose rows
    # are the actual classes and whose columns are the predicted classes, the
    # second row/column being the positive class.
    # Returns a named list with the four cell counts and the derived metrics.
    compute_binary_metrics <- function(conf_mat) {
      stopifnot(identical(dim(conf_mat), c(2L, 2L)))
      # table() stores integer counts; the MCC denominator multiplies four
      # sums, which overflows integer arithmetic (giving NA) on moderately
      # large data sets, so all arithmetic is done in doubles.
      counts <- matrix(as.numeric(conf_mat), nrow = 2L)
      tn <- counts[1, 1]
      fp <- counts[1, 2]
      fn <- counts[2, 1]
      tp <- counts[2, 2]

      accuracy <- (tn + tp) / sum(counts)
      sensitivity <- tp / (tp + fn)
      specificity <- tn / (tn + fp)
      precision <- tp / (tp + fp)
      f1 <- (2 * precision * sensitivity) / (precision + sensitivity)
      mcc <- (tp * tn - fp * fn) /
        sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

      list(
        tn = tn, fp = fp, fn = fn, tp = tp,
        accuracy = accuracy, error_rate = 1 - accuracy,
        sensitivity = sensitivity, specificity = specificity,
        precision = precision, f1 = f1, mcc = mcc
      )
    }

    # Print one block of metrics under the given heading
    print_metrics <- function(heading, metrics, auc_value) {
      cat(heading, "\n", sep = "")
      cat("Accuracy:", metrics$accuracy, "\n")
      cat("Error Rate:", metrics$error_rate, "\n")
      cat("Sensitivity:", metrics$sensitivity, "\n")
      cat("Specificity:", metrics$specificity, "\n")
      cat("Precision:", metrics$precision, "\n")
      cat("F1 Score:", metrics$f1, "\n")
      cat("MCC:", metrics$mcc, "\n")
      cat("AUC:", auc_value, "\n")
    }

    train_metrics <- compute_binary_metrics(confMatTrain)
    valid_metrics <- compute_binary_metrics(confMatValid)

    # Cell counts (a = TN, b = FP, c = FN, d = TP), kept under their
    # original names
    a_train <- train_metrics$tn
    b_train <- train_metrics$fp
    c_train <- train_metrics$fn
    d_train <- train_metrics$tp
    a_valid <- valid_metrics$tn
    b_valid <- valid_metrics$fp
    c_valid <- valid_metrics$fn
    d_valid <- valid_metrics$tp

    # Training Set Metrics
    acc_train <- train_metrics$accuracy
    error_rate_train <- train_metrics$error_rate
    sen_train <- train_metrics$sensitivity
    sep_train <- train_metrics$specificity
    precision_train <- train_metrics$precision
    F1_train <- train_metrics$f1
    MCC_train <- train_metrics$mcc
    # Reuse the ROC object computed earlier instead of fitting it again
    auc_train <- trainRoc$auc

    # Validation Set Metrics
    acc_valid <- valid_metrics$accuracy
    error_rate_valid <- valid_metrics$error_rate
    sen_valid <- valid_metrics$sensitivity
    sep_valid <- valid_metrics$specificity
    precision_valid <- valid_metrics$precision
    F1_valid <- valid_metrics$f1
    MCC_valid <- valid_metrics$mcc
    auc_valid <- validRoc$auc

    # Print Metrics
    print_metrics("Training Metrics", train_metrics, auc_train)
    cat("\n")
    print_metrics("Validation Metrics", valid_metrics, auc_valid)

    在R语言中,使用 caret 包训练AdaBoost模型(method = "ada")时,可调参数不多,下面是几个关键参数:

    ①iter: 这是最重要的参数之一,代表弱学习器的数量,即AdaBoost算法中的迭代次数。较大的iter值通常可以提高模型的复杂度和拟合能力,但也可能导致过拟合。

    ②maxdepth: 这是决策树的最大深度。AdaBoost通常使用决策树作为其弱学习器。通过调整maxdepth可以控制单个决策树的复杂度,从而影响整个集成模型的复杂度。

    ③nu: 这个参数是学习率(也称为收缩参数或步长)。它用于更新每次迭代中模型权重。较小的nu值可以使模型学习得更加谨慎,通常可以减少过拟合的风险,但可能需要更多的迭代次数来收敛。

    结果输出(默认参数):

    在默认参数下,caret包已经默默帮我们对 iter 和 maxdepth 做了小范围的网格寻优(nu 默认固定为0.1)。

    从AUC来看,AdaBoost随便一跑,就跑出个不错的结果。不过有些过拟合了,验证集的性能稍微差些。

    三、AdaBoost手动调参方法(3个值)

    设置iter值取值50、100、200、400、600;maxdepth取值1、2、5、7和9;nu取值0.01、0.1、0.5:

    # Load necessary libraries
    library(caret)
    library(pROC)
    library(ggplot2)

    # Assume 'data' is your data frame; column X holds the binary class label.
    # Set seed to ensure reproducibility
    set.seed(123)

    # Convert the target to a factor BEFORE partitioning. createDataPartition()
    # samples within classes only for factor input (numeric input is grouped
    # by percentiles instead), and converting once guarantees that both
    # subsets share the same factor levels.
    target <- as.factor(data$X)
    stopifnot(nlevels(target) == 2L)  # everything below assumes two classes

    # Split data into training and validation sets (80% training, 20% validation)
    trainIndex <- createDataPartition(target, p = 0.8, list = FALSE)
    trainData <- data[trainIndex, ]
    validData <- data[-trainIndex, ]
    trainData$X <- target[trainIndex[, 1]]
    validData$X <- target[-trainIndex[, 1]]

    # 10-fold cross-validation. The object is called cvControl so that it does
    # not shadow caret's trainControl() function.
    cvControl <- trainControl(method = "cv", number = 10)

    # Tuning grid for the three parameters of caret's "ada" method.
    # 5 x 5 x 3 = 75 combinations, each evaluated with 10-fold CV.
    tuneGrid <- expand.grid(
      iter = c(50, 100, 200, 400, 600),
      maxdepth = c(1, 2, 5, 7, 9),
      nu = c(0.01, 0.1, 0.5)
    )

    # Train the AdaBoost model (caret method "ada") over the tuning grid
    model <- train(
      X ~ .,
      data = trainData,
      method = "ada",
      trControl = cvControl,
      tuneGrid = tuneGrid
    )

    # Print the best parameters found by the model
    best_params <- model$bestTune
    cat("The best parameters found are:\n")
    print(best_params)

    # Predicted probability of the second factor level (the "positive" class)
    trainPredict <- predict(model, trainData, type = "prob")[, 2]
    validPredict <- predict(model, validData, type = "prob")[, 2]

    # ROC curves and AUC values. levels/direction are pinned so that a higher
    # probability always means "second level"; with the automatic direction a
    # worse-than-random model would be silently flipped to an AUC above 0.5.
    classLevels <- levels(target)
    trainRoc <- roc(
      response = trainData$X, predictor = trainPredict,
      levels = classLevels, direction = "<", quiet = TRUE
    )
    validRoc <- roc(
      response = validData$X, predictor = validPredict,
      levels = classLevels, direction = "<", quiet = TRUE
    )

    # Build one ROC plot.
    #   roc_obj  pROC roc object
    #   title    plot title
    #   colour   colour of the curve, shaded area and AUC label
    #   label    data set name used in the AUC annotation
    #   label_y  vertical position of the AUC annotation
    # Returns the ggplot object (auto-printed when called at top level).
    plot_roc_curve <- function(roc_obj, title, colour, label, label_y) {
      curve <- data.frame(
        fpr = 1 - roc_obj$specificities,
        tpr = roc_obj$sensitivities
      )
      # Order from (0, 0) to (1, 1) and draw with geom_path(): geom_line()
      # re-sorts by x only, which zig-zags on the vertical steps of the curve.
      curve <- curve[order(curve$fpr, curve$tpr), ]
      auc_label <- paste(label, "AUC =", round(auc(roc_obj), 2))

      ggplot(data = curve, aes(x = fpr, y = tpr)) +
        geom_path(color = colour) +
        geom_area(alpha = 0.2, fill = colour) +
        geom_abline(
          slope = 1, intercept = 0, linetype = "dashed", color = "black"
        ) +
        ggtitle(title) +
        xlab("False Positive Rate") +
        ylab("True Positive Rate") +
        annotate(
          "text", x = 0.5, y = label_y, label = auc_label,
          hjust = 0.5, color = colour
        )
    }

    # Plot ROC curves with AUC values
    plot_roc_curve(trainRoc, "Training ROC Curve", "blue", "Training", 0.1)
    plot_roc_curve(validRoc, "Validation ROC Curve", "red", "Validation", 0.2)

    # Confusion matrices based on a 0.5 probability cutoff. The prediction is
    # wrapped in a factor with both levels so the table is always 2 x 2, even
    # when every observation falls on the same side of the cutoff.
    classify_at_cutoff <- function(prob, cutoff = 0.5) {
      factor(prob >= cutoff, levels = c(FALSE, TRUE))
    }
    confMatTrain <- table(trainData$X, classify_at_cutoff(trainPredict))
    confMatValid <- table(validData$X, classify_at_cutoff(validPredict))
    # Draw a confusion matrix as a ggplot2 heat map and print it.
    #   conf_mat      two-way table (first dimension = actual class,
    #                 second dimension = predicted class)
    #   dataset_name  label inserted into the plot title
    plot_confusion_matrix <- function(conf_mat, dataset_name) {
      # Long format: one row per (actual, predicted) cell with its count
      cells <- setNames(
        as.data.frame(as.table(conf_mat)),
        c("Actual", "Predicted", "Freq")
      )
      plot_title <- paste("Confusion Matrix -", dataset_name, "Set")

      heatmap <- ggplot(
        data = cells,
        aes(x = Predicted, y = Actual, fill = Freq)
      )
      heatmap <- heatmap + geom_tile(color = "white")
      heatmap <- heatmap +
        geom_text(aes(label = Freq), vjust = 1.5, color = "black", size = 5)
      heatmap <- heatmap +
        scale_fill_gradient(low = "white", high = "steelblue")
      heatmap <- heatmap +
        labs(title = plot_title, x = "Predicted Class", y = "Actual Class")
      heatmap <- heatmap +
        theme_minimal() +
        theme(
          axis.text.x = element_text(angle = 45, hjust = 1),
          plot.title = element_text(hjust = 0.5)
        )

      print(heatmap)
    }
    # Plot and display the confusion matrices
    plot_confusion_matrix(confMatTrain, "Training")
    plot_confusion_matrix(confMatValid, "Validation")

    # Compute classification metrics from a 2 x 2 confusion matrix whose rows
    # are the actual classes and whose columns are the predicted classes, the
    # second row/column being the positive class.
    # Returns a named list with the four cell counts and the derived metrics.
    compute_binary_metrics <- function(conf_mat) {
      stopifnot(identical(dim(conf_mat), c(2L, 2L)))
      # table() stores integer counts; the MCC denominator multiplies four
      # sums, which overflows integer arithmetic (giving NA) on moderately
      # large data sets, so all arithmetic is done in doubles.
      counts <- matrix(as.numeric(conf_mat), nrow = 2L)
      tn <- counts[1, 1]
      fp <- counts[1, 2]
      fn <- counts[2, 1]
      tp <- counts[2, 2]

      accuracy <- (tn + tp) / sum(counts)
      sensitivity <- tp / (tp + fn)
      specificity <- tn / (tn + fp)
      precision <- tp / (tp + fp)
      f1 <- (2 * precision * sensitivity) / (precision + sensitivity)
      mcc <- (tp * tn - fp * fn) /
        sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

      list(
        tn = tn, fp = fp, fn = fn, tp = tp,
        accuracy = accuracy, error_rate = 1 - accuracy,
        sensitivity = sensitivity, specificity = specificity,
        precision = precision, f1 = f1, mcc = mcc
      )
    }

    # Print one block of metrics under the given heading
    print_metrics <- function(heading, metrics, auc_value) {
      cat(heading, "\n", sep = "")
      cat("Accuracy:", metrics$accuracy, "\n")
      cat("Error Rate:", metrics$error_rate, "\n")
      cat("Sensitivity:", metrics$sensitivity, "\n")
      cat("Specificity:", metrics$specificity, "\n")
      cat("Precision:", metrics$precision, "\n")
      cat("F1 Score:", metrics$f1, "\n")
      cat("MCC:", metrics$mcc, "\n")
      cat("AUC:", auc_value, "\n")
    }

    train_metrics <- compute_binary_metrics(confMatTrain)
    valid_metrics <- compute_binary_metrics(confMatValid)

    # Cell counts (a = TN, b = FP, c = FN, d = TP), kept under their
    # original names
    a_train <- train_metrics$tn
    b_train <- train_metrics$fp
    c_train <- train_metrics$fn
    d_train <- train_metrics$tp
    a_valid <- valid_metrics$tn
    b_valid <- valid_metrics$fp
    c_valid <- valid_metrics$fn
    d_valid <- valid_metrics$tp

    # Training Set Metrics
    acc_train <- train_metrics$accuracy
    error_rate_train <- train_metrics$error_rate
    sen_train <- train_metrics$sensitivity
    sep_train <- train_metrics$specificity
    precision_train <- train_metrics$precision
    F1_train <- train_metrics$f1
    MCC_train <- train_metrics$mcc
    # Reuse the ROC object computed earlier instead of fitting it again
    auc_train <- trainRoc$auc

    # Validation Set Metrics
    acc_valid <- valid_metrics$accuracy
    error_rate_valid <- valid_metrics$error_rate
    sen_valid <- valid_metrics$sensitivity
    sep_valid <- valid_metrics$specificity
    precision_valid <- valid_metrics$precision
    F1_valid <- valid_metrics$f1
    MCC_valid <- valid_metrics$mcc
    auc_valid <- validRoc$auc

    # Print Metrics
    print_metrics("Training Metrics", train_metrics, auc_train)
    cat("\n")
    print_metrics("Validation Metrics", valid_metrics, auc_valid)

    结果输出:

    以上是找到的相对最优参数组合,看看具体性能:

    还不如默认的性能好呢。

    看看GPT给的参数的取值建议,祝各位调得开心:

    iter (迭代次数): 这个参数通常设置在10到1000之间。较小的数据集可能需要较少的迭代,而较大或较复杂的数据集可能需要更多的迭代。通常开始可以尝试50, 100, 200等值,然后根据模型的性能来调整。

    maxdepth (树的最大深度): 这个参数一般设置在1到10之间。深度为1意味着使用决策树桩(仅一个决策点),这有助于防止过拟合,是AdaBoost中常用的设置。但对于更复杂的数据模式,可能需要更深的树。可以尝试的值包括1, 2, 3, 5等。

    nu (学习率): 学习率的典型取值范围是0.01到1。较小的学习率(如0.01, 0.1)可以使模型学习得更稳健,但收敛速度可能较慢,需要更多的迭代次数。较高的学习率可以加快学习速度,但可能导致模型在训练过程中不稳定。

    四、最后

    数据嘛:

    链接:https://pan.baidu.com/s/1rEf6JZyzA1ia5exoq5OF7g?pwd=x8xm

    提取码:x8xm

  • 相关阅读:
    音视频实战---音视频解码
    迁移学习——ResNet152
    Listen、Attention、Spell模型
    Hive企业级调优
    关于软件<PDF文档管理系统V1.0>的介绍
    给大龄准备转行网络工程师的朋友一些建议
    SpringBoot整合knife4j
    后端统一处理返回前端日期LocalDateTime格式化去T,Long返回前端损失精度问题
    C进阶-自定义类型:结构体、枚举、联合
    【马士兵】Python基础--04
  • 原文地址:https://blog.csdn.net/qq_30452897/article/details/140368470