• Short-Text Classification on TNEWS (Toutiao Chinese News)


    Data

    Overview

    The TNEWS Toutiao Chinese news dataset is taken from the news section of Toutiao and covers 15 news categories, including travel, education, finance, and military. Dataset size: training set (53,360), validation set (10,000), test set (10,000).


    Data Processing
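    The original post never shows the environment setup, dataset loading, or tokenizer creation that the code below relies on. The following is a minimal sketch of that missing setup; the "ernie-1.0" checkpoint is an assumption (the post does not name its pretrained model), and any PaddleNLP model/tokenizer pair can be substituted.

    # Assumed setup (not shown in the original post).
    # "ernie-1.0" is an assumption; use any PaddleNLP pretrained model/tokenizer pair.
    import os
    import time
    import json
    from functools import partial

    import numpy as np
    import paddle
    import paddle.nn as nn
    from tqdm import tqdm
    from paddlenlp.datasets import load_dataset
    from paddlenlp.transformers import ErnieTokenizer, LinearDecayWithWarmup

    task_name = "tnews"
    # train_ds / dev_ds are MapDataset objects with "sentence" and "label" fields
    train_ds, dev_ds = load_dataset("clue", task_name, splits=["train", "dev"])
    tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")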

    Convert the text data into token ids

    # Function that converts a single example into token ids
    def convert_example(example, tokenizer):
        encoded_inputs = tokenizer(text=example["sentence"], max_seq_len=128, pad_to_max_seq_len=True)
        return tuple([np.array(x, dtype="int64") for x in [
                encoded_inputs["input_ids"], encoded_inputs["token_type_ids"], [example["label"]]]])

    # Convert the training set into ids
    train_ds = train_ds.map(partial(convert_example, tokenizer=tokenizer))
    # Convert the validation set into ids
    dev_ds = dev_ds.map(partial(convert_example, tokenizer=tokenizer))
    # Build the DataLoader for the training set
    train_batch_size = 32
    dev_batch_size = 32
    train_batch_sampler = paddle.io.DistributedBatchSampler(dataset=train_ds, batch_size=train_batch_size, shuffle=True)
    train_data_loader = paddle.io.DataLoader(dataset=train_ds, batch_sampler=train_batch_sampler, return_list=True)

    # The validation set is evaluated on a single card, so paddle.io.BatchSampler is enough
    # Define the DataLoader for the validation set
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=dev_batch_size, shuffle=False)

    dev_data_loader = paddle.io.DataLoader(
            dataset=dev_ds,
            batch_sampler=dev_batch_sampler,
            return_list=True)
    
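    As a quick check (an addition, not from the original post), pulling one batch from train_data_loader confirms the shapes implied by the 128-token padding and the batch size of 32:

    # Illustration only: inspect one batch produced by the training DataLoader.
    input_ids, token_type_ids, labels = next(iter(train_data_loader))
    print(input_ids.shape)       # [32, 128] -> [train_batch_size, max_seq_len]
    print(token_type_ids.shape)  # [32, 128]
    print(labels.shape)          # [32, 1]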

    Model Construction

    class ShortTextClassification(nn.Layer):
        def __init__(self, pretrained_model,num_class,dropout=None):
            super().__init__()
            self.ptm = pretrained_model
            self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
    
            # Project the pooled [CLS] representation to num_class logits
            self.classifier = nn.Linear(self.ptm.config["hidden_size"], num_class)
    
        def forward(self,
                    input_ids,
                    token_type_ids=None,
                    position_ids=None,
                    attention_mask=None):
    
            _, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids,
                                        attention_mask)
    
            cls_embedding = self.dropout(cls_embedding)
            logits = self.classifier(cls_embedding)
    
            return logits
    
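    The post also does not show where pretrained_model comes from. A minimal sketch, assuming the same ERNIE checkpoint as the tokenizer above:

    # Assumed backbone (not shown in the original post); "ernie-1.0" is an assumption.
    # ErnieModel returns (sequence_output, pooled_output), which matches the
    # "_, cls_embedding = self.ptm(...)" unpacking in ShortTextClassification.
    from paddlenlp.transformers import ErnieModel
    pretrained_model = ErnieModel.from_pretrained("ernie-1.0")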
    model = ShortTextClassification(pretrained_model, num_class=len(train_ds.label_list))
    

    Training Configuration

    epochs = 3
    num_training_steps = len(train_data_loader) * epochs

    # Define the learning-rate scheduler that adjusts the lr during training
    lr_scheduler = LinearDecayWithWarmup(2e-5, num_training_steps, 0.0)

    # Directory for the model parameters saved during training
    save_dir = "checkpoint"
    # Create the save directory
    os.makedirs(save_dir, exist_ok=True)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    # Define the optimizer
    # (note: weight_decay is 0.0 here, so the exclusion list above has no practical
    #  effect; set it to e.g. 0.01 to actually apply weight decay)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        parameters=model.parameters(),
        weight_decay=0.0,
        apply_decay_param_fun=lambda x: x in decay_params)
    # Cross-entropy loss
    criterion = paddle.nn.loss.CrossEntropyLoss()
    # Accuracy is used as the evaluation metric
    metric = paddle.metric.Accuracy()
    
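    A note on the scheduler: with the warmup argument set to 0.0, the learning rate simply decays linearly from 2e-5 to 0 over num_training_steps. The throwaway scheduler below (illustration only, separate from the real lr_scheduler) shows how a non-zero warmup proportion behaves:

    # Illustration only: a separate scheduler, so the real lr_scheduler is untouched.
    demo_scheduler = LinearDecayWithWarmup(2e-5, 100, 0.1)  # 100 total steps, 10% warmup
    for _ in range(5):
        demo_scheduler.step()
    print(demo_scheduler.get_lr())  # lr is still ramping up during the warmup phase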

    Model Training and Prediction

    Define the evaluation function

    # We also need to evaluate on the validation set during training, so define the evaluation function first
    @paddle.no_grad()
    def evaluate(model, criterion, metric, data_loader, phase="dev"):
        model.eval()
        metric.reset()
        losses = []
        for batch in data_loader:
            input_ids, token_type_ids, labels = batch
            probs = model(input_ids=input_ids, token_type_ids=token_type_ids)
            # Compute the loss
            loss = criterion(probs, labels)
            losses.append(loss.numpy())
            # Compute batch accuracy
            correct = metric.compute(probs, labels)
            # Update the running accuracy
            metric.update(correct)
            accu = metric.accumulate()
        print("eval {} loss: {:.5}, accu: {:.5}".format(phase,
                                                        np.mean(losses), accu))
        model.train()
        metric.reset()
        return np.mean(losses), accu
    
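    Besides being called inside the training loop, the function can be used on its own to check the current model against the validation set:

    # Standalone usage: evaluate the current model on the dev set.
    dev_loss, dev_acc = evaluate(model, criterion, metric, dev_data_loader, phase="dev")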

    Training

    def do_train(model, criterion, metric, dev_data_loader, train_data_loader):

        global_step = 0
        tic_train = time.time()
        best_accuracy = 0.0

        for epoch in range(1, epochs + 1):
            for step, batch in enumerate(train_data_loader, start=1):

                input_ids, token_type_ids, labels = batch
                probs = model(input_ids=input_ids, token_type_ids=token_type_ids)
                loss = criterion(probs, labels)
                correct = metric.compute(probs, labels)
                metric.update(correct)
                acc = metric.accumulate()

                global_step += 1

                # Print training metrics every 100 steps
                if global_step % 100 == 0:
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %.5f, accu: %.5f, speed: %.2f step/s"
                        % (global_step, epoch, step, loss, acc,
                            100 / (time.time() - tic_train)))
                    tic_train = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                # Evaluate on the validation set every 500 steps and keep the best checkpoint
                if global_step % 500 == 0:
                    eval_loss, eval_accu = evaluate(model, criterion, metric, dev_data_loader, "dev")
                    if best_accuracy < eval_accu:
                        best_accuracy = eval_accu
                        # Save the model parameters
                        save_param_path = os.path.join(save_dir, 'model_best.pdparams')
                        paddle.save(model.state_dict(), save_param_path)
                        # Save the tokenizer
                        tokenizer.save_pretrained(save_dir)

    do_train(model, criterion, metric, dev_data_loader, train_data_loader)
    

    Prediction

    state_dict=paddle.load('checkpoint/model_best.pdparams')
    model.load_dict(state_dict)
    
    # For the test set you can choose either the test or the test1.0 split
    test_ds = load_dataset('clue', task_name, splits=['test1.0'])
    
    def do_predict(model, example):
        # Convert the text into input_ids and token_type_ids
        # (training used max_seq_len=128; padding to 512 also works for inference)
        encoded_text = tokenizer(text=example["sentence"], max_seq_len=512, pad_to_max_seq_len=True)
        # Convert input_ids into a paddle tensor
        input_ids = paddle.to_tensor([encoded_text['input_ids']])
        # Convert token_type_ids into a paddle tensor
        segment_ids = paddle.to_tensor([encoded_text['token_type_ids']])
        # Run the model to get the class logits
        logits = model(input_ids, segment_ids)
        # Take the index of the largest logit as the predicted label
        pred = paddle.argmax(logits, axis=1)
        # print('Predicted label: {}'.format(pred.numpy()[0]))
        # print('Ground-truth label: {}'.format(test_ds[0]['label']))
        return pred.numpy()[0]

    model.eval()  # disable dropout for inference
    predict_label = []
    for i in tqdm(range(len(test_ds))):
        example = test_ds[i]
        label_pred = do_predict(model, example)
        predict_label.append(label_pred)
    
    output_submit_file = "tnews10_predict.json"
    label_map = {i: label for i, label in enumerate(train_ds.label_list)}
    # Write the predicted labels, one JSON object per line
    with open(output_submit_file, "w") as writer:
        for i, pred in enumerate(predict_label):
            json_d = {}
            json_d['id'] = i
            json_d['label'] = str(label_map[pred])
            writer.write(json.dumps(json_d) + '\n')
    
  • Original article: https://blog.csdn.net/qq_53817374/article/details/126011835