• Gemma


    1. Usage

    First, download the model from Hugging Face. The direct download kept failing, so I switched to an HF mirror site. Downloading Gemma requires an HF token; just follow the steps on the mirror. The code is mostly adapted from posts shared on the Kaggle forums.

    huggingface-cli download --token hf_XXX --resume-download google/gemma-7b --local-dir gemma-7b-mirror
    
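    If the CLI route still fails, the same download can be scripted with huggingface_hub. A minimal sketch, assuming the hf-mirror.com endpoint (substitute your own mirror, token, and paths):

    import os
    # The endpoint must be set before huggingface_hub is imported (assumed mirror URL)
    os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

    from huggingface_hub import snapshot_download

    snapshot_download(
        repo_id="google/gemma-7b",
        token="hf_XXX",               # your HF access token
        local_dir="gemma-7b-mirror",  # same target directory as the CLI command above
        resume_download=True,
    )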

    Here I sometimes use the 2B model and sometimes the 7B, switching between them.

    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Load the locally downloaded Gemma checkpoint
    tokenizer = AutoTokenizer.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")
    Gemma = AutoModelForCausalLM.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")

    def answer_the_question(question):
        # Tokenize the question and generate up to 256 tokens
        input_ids = tokenizer(question, return_tensors="pt")
        generated_text = Gemma.generate(**input_ids, max_length=256)
        # Decode the generated ids back into text
        answer = tokenizer.decode(generated_text[0], skip_special_tokens=True)
        return answer

    question = "给我写一首优美的诗歌?"
    answer = answer_the_question(question)
    print(answer)
    
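    The snippet above feeds the raw string to a base checkpoint. If an instruction-tuned Gemma checkpoint were used instead (an assumption; the post works with the base model), the prompt would usually be wrapped with the tokenizer's chat template. A minimal sketch:

    # Illustrative only: assumes an instruction-tuned Gemma checkpoint is loaded as `Gemma`
    chat = [{"role": "user", "content": "给我写一首优美的诗歌?"}]
    input_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt")
    generated = Gemma.generate(input_ids, max_new_tokens=256)
    print(tokenizer.decode(generated[0], skip_special_tokens=True))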

    2. RAG

    Reference

    from langchain_community.document_loaders import PyPDFLoader
    from langchain.text_splitter import CharacterTextSplitter
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS
    ## 2.1 Retrieve sentence chunks for the question
    import os
    def get_all_pdfs(directory):
        pdf_files = []
        for root, dirs, files in os.walk(directory):
            for file in files:
                if file.endswith(".pdf"):
                    pdf_files.append(os.path.join(root, file))
        return pdf_files
    
    
    class RAG:
        def __init__(self, num_retrieved_docs=5, pdf_folder_path='D:/Gemma/PDF'):
            pdf_files = get_all_pdfs(pdf_folder_path)
            print("Documents used", pdf_files)
            loaders = [PyPDFLoader(pdf_file) for pdf_file in pdf_files]
            all_documents = []
            for loader in loaders:
                raw_documents = loader.load()
                text_splitter = CharacterTextSplitter(
                    separator="\n\n",
                    chunk_size=10,
                    chunk_overlap=1,
                    # length_function=len,
                )
                documents = text_splitter.split_documents(raw_documents)
                all_documents.extend(documents)
            embeddings = HuggingFaceEmbeddings(model_name="D:/Projects/model/m3e-base")    
            self.db = FAISS.from_documents(all_documents, embeddings)
            self.retriever = self.db.as_retriever(search_kwargs={"k": num_retrieved_docs})
    
        def search(self, query):
            docs = self.retriever.get_relevant_documents(query)
            return docs
    retriever = RAG()
    ## 2.2 Answer based on the sentence chunks and the question
    class Assistant:
        def __init__(self):
            self.tokenizer = AutoTokenizer.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")
            self.Gemma = AutoModelForCausalLM.from_pretrained("D:/Gemma/gemma-2b-int-mirror2")
    
        def create_prompt(self, query, retrieved_info):
            prompt = f"""你是人工智能助手,需要根据Relevant information里面的相关内容回答用户的Instruction,其中相关信息如下:
            Instruction: {query}
            Relevant information: {retrieved_info}
            Output:
            """
            print(prompt)
            return prompt
        
        def reply(self, query, retrieved_info):
            prompt = self.create_prompt(query, retrieved_info)
            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids  # encode the full prompt (not just the query) so the retrieved context is actually used
            # Generate text with a focus on factual responses
            generated_text = self.Gemma.generate(
                input_ids,
                do_sample=True,
                max_length=500,
                temperature=0.7, # Adjust temperature according to the task, for code generation it can be 0.9
                
            )
            # Decode and return the answer
            answer = self.tokenizer.decode(generated_text[0], skip_special_tokens=True)
            return answer
    chatbot = Assistant()
    ## 2.3 Put the RAG pipeline to use
    def generate_reply(query):
        related_docs = retriever.search(query)
        #print('related docs', related_docs)
        reply = chatbot.reply(query, related_docs)
        return reply
    reply = generate_reply("存在的不足及后续的优化工作")
    for s in reply.split('\n'):
        print(s)
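
    Note that retriever.search returns a list of LangChain Document objects; dropping that list straight into the f-string prompt stringifies the whole objects, metadata included. A small helper like this sketch (an addition for illustration, not in the original code) keeps only the chunk text:

    def format_docs(docs):
        # Keep only the text of each retrieved chunk, one chunk per line
        return "\n".join(doc.page_content for doc in docs)

    query = "存在的不足及后续的优化工作"
    reply = chatbot.reply(query, format_docs(retriever.search(query)))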
    

    3. LoRA

    3.1 LoRA classification task

    Reference
    Train a binary classification model on the nlp-getting-started dataset. First, load the source model:

    from datasets import load_dataset
    from transformers import AutoTokenizer,AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments,pipeline
    from peft import prepare_model_for_int8_training,LoraConfig, TaskType, get_peft_model
    import numpy as np
    NUM_CLASSES = 2  # number of output classes
    BATCH_SIZE,EPOCHS,R,LORA_ALPHA,LORA_DROPOUT = 8,5,64,32,0.1  # LoRA training hyperparameters
    MODEL_PATH="D:/Gemma/gemma-2b-int-mirror2"  # local model path
    # 1. Source model, configured with a two-class classification head
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH,num_labels=NUM_CLASSES)
    print(model)
    

    Process the CSV data, encoding the input text with the tokenizer:

    #2. Process the dataset; over-long inputs are truncated (after tokenization)
    dataset = load_dataset('csv', data_files='D:/Gemma/nlp-getting-started/train.csv')
    dataset['test'] = dataset['train']  # the train split is reused as the test split here
    dataset = dataset.remove_columns(['id', 'keyword', 'location'])
    dataset = dataset.rename_column("target", "label")  # only the text and label columns are kept
    tokenized_dataset = {}  # train and test splits
    for split in dataset.keys():
        tokenized_dataset[split] = dataset[split].map(lambda x: tokenizer(x["text"], truncation=True), batched=True)
    print(tokenized_dataset["train"])
    print(tokenized_dataset["train"][1])
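
    Reusing the train split as the test split means the model is evaluated on data it was trained on. A fairer setup holds out part of the data; a minimal sketch that would replace the dataset['test'] = dataset['train'] line above (the 80/20 ratio is an assumption):

    # Hold out 20% of the data as a disjoint test split, then run the tokenization loop as before
    dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)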
    

    Configure the LoRA parameters on top of the source model to obtain lora_model:

    #3. LoRA configuration
    # prepare_model_for_int8_training is deprecated in newer peft releases; prepare_model_for_kbit_training replaces it
    model = prepare_model_for_int8_training(model)
    lora_config = LoraConfig(
        r=R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        task_type=TaskType.SEQ_CLS,  # SEQ_CLS: sequence classification; TOKEN_CLS: token classification (NER); SEQ2SEQ: translation; LM: language modeling
        target_modules='all-linear'  # all-linear: every linear layer; embedding and convolutional layers can also be targeted
    )
    lora_model = get_peft_model(model, lora_config)
    print(lora_model)
    print(lora_model.print_trainable_parameters())  # trainable parameters of the LoRA model
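
    Instead of 'all-linear', the Gemma projection layers can be listed explicitly to control which modules get adapters. A sketch (the module names follow the Gemma architecture; adjust if your checkpoint differs):

    lora_config = LoraConfig(
        r=R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        task_type=TaskType.SEQ_CLS,
        # Gemma's attention and MLP linear projections; a shorter list trains fewer adapter weights
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    )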
    

    Configure the training arguments for lora_model:

    #4. LoRA training setup (metrics and TrainingArguments)
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return {"accuracy": (predictions == labels).mean()}
    
    trainer = Trainer(
        model=lora_model,
        args=TrainingArguments(
            output_dir="./LoAR_data/",
            learning_rate=2e-5,
            per_device_train_batch_size=BATCH_SIZE,
            per_device_eval_batch_size=BATCH_SIZE,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            num_train_epochs=EPOCHS,
            weight_decay=0.01,
            load_best_model_at_end=True,
            logging_steps=10,
            report_to="none"
        ),
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )
    

    Train the model, then save and use it:

    #5. Train and evaluate
    print("Evaluating the Model Before Training!")
    trainer.evaluate()
    print("Training the Model")
    trainer.train()
    print("Evaluating the trained model")
    trainer.evaluate()
    #6. Save and use
    lora_model.save_pretrained('fine-tuned-model')
    clf = pipeline("text-classification", lora_model, tokenizer=MODEL_PATH)  # pipeline around the LoRA-tuned model
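
    To reuse the saved adapter later, it can be reattached to a freshly loaded base model and merged before inference. A minimal sketch (the example tweet is purely illustrative):

    from peft import PeftModel

    # Reload the base model and attach the LoRA adapter saved above
    base = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=NUM_CLASSES)
    tuned = PeftModel.from_pretrained(base, 'fine-tuned-model')
    merged = tuned.merge_and_unload()  # fold the adapter weights back into the base model
    clf = pipeline("text-classification", model=merged, tokenizer=tokenizer)
    print(clf("There is a wildfire spreading near the highway"))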
    

    3.2 LoRA Chinese language-modeling task

    Reference
    First, load the source model and its config:

    from transformers import AutoConfig,AutoTokenizer,AutoModelForCausalLM, Trainer, TrainingArguments, PreTrainedTokenizer, DataCollatorForLanguageModeling
    from peft import LoraConfig, get_peft_model,prepare_model_for_kbit_training,PeftModel
    import torch
    import datasets
    from tqdm import tqdm
    import json
    BATCH_SIZE,EPOCHS,R,LORA_ALPHA,LORA_DROPOUT = 8,5,64,32,0.1  # LoRA training hyperparameters
    MODEL_PATH="D:/Gemma/gemma-2b-int-mirror2"  # local model path
    device = torch.device('cuda:0')
    # 1. Source model and its config
    config = AutoConfig.from_pretrained(MODEL_PATH, trust_remote_code=True)
    config.is_causal = True  # ensure the model only attends to left-side context when generating
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH,device_map="auto", config=config,trust_remote_code=True)
    

    Process the JSON data according to the model and its config:

    #2. Preprocess the dataset according to the model config (tokenized), then save and reload it
    def preprocess(tokenizer: PreTrainedTokenizer, config, file_path, max_seq_length, prompt_key, target_key, skip_overlength=False):
        # Data preprocessing
        pad_token_id = tokenizer.pad_token_id  # id of the padding token

        with open(file_path, "r", encoding="utf8") as f:
            for line in tqdm(f.readlines()):
                example = json.loads(line)
                prompt_ids = tokenizer.encode(example[prompt_key], max_length=max_seq_length, truncation=True)
                target_ids = tokenizer.encode(example[target_key], max_length=max_seq_length, truncation=True)

                # Check whether prompt plus target exceeds the maximum length and skip if requested
                total_length = len(prompt_ids) + len(target_ids) + (1 if config.eos_token_id is not None else 0)
                if skip_overlength and total_length > max_seq_length:
                    continue

                # Concatenate prompt and target, appending the EOS token if the config provides one
                input_ids = prompt_ids + target_ids
                if config.eos_token_id is not None:
                    input_ids.append(config.eos_token_id)

                # Truncate the sequence to the maximum length
                input_ids = input_ids[:max_seq_length]

                # Pad the sequence to the maximum length
                input_ids.extend([pad_token_id] * (max_seq_length - len(input_ids)))

                assert len(input_ids) == max_seq_length, "sequence length must equal max_seq_length"

                yield {
                    "input_ids": input_ids,
                    "seq_len": len(prompt_ids)  # length of the original prompt, excluding padding
                }
    dataset = datasets.Dataset.from_generator(lambda: preprocess(tokenizer,
                                                config,
                                                "D:/Gemma/try/hc3_chatgpt_zh_specific_qa.json",
                                                max_seq_length=2000,
                                                prompt_key="q",
                                                target_key="a",))

    dataset.save_to_disk("h3c-chinese")  # save the processed dataset
    train_set = datasets.load_from_disk("h3c-chinese")  # reload the processed dataset
    

    Configure the LoRA parameters:

    #3. LoRA configuration
    model = prepare_model_for_kbit_training(model)
    lora_config = LoraConfig(
        r=R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        task_type="CAUSAL_LM",
        target_modules='all-linear'
    )
    lora_model = get_peft_model(model, lora_config)
    print(lora_model)
    print(lora_model.print_trainable_parameters())  # trainable parameters of the LoRA model
    

    Configure the LoRA training arguments and batch the preprocessed input_ids with a data collator (compute_metrics is left commented out here). For causal-LM training the collator must also supply labels; an optional prompt-masking collator is sketched below.
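
    The Trainer below uses DataCollatorForLanguageModeling(mlm=False), which simply copies input_ids into labels. If the loss should be computed only on the answer tokens (which is why seq_len is stored during preprocessing), a custom collator along these lines could be swapped in. This is an illustrative sketch, not part of the original post; using it with Trainer also requires remove_unused_columns=False in TrainingArguments so the seq_len field survives until collation.

    import torch

    def causal_lm_collator(features):
        # Stack the already padded input_ids into a batch tensor
        input_ids = torch.tensor([f["input_ids"] for f in features], dtype=torch.long)
        labels = input_ids.clone()
        for i, f in enumerate(features):
            labels[i, : f["seq_len"]] = -100                 # no loss on the prompt tokens
        labels[input_ids == tokenizer.pad_token_id] = -100   # no loss on padding
        return {"input_ids": input_ids, "labels": labels}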

    trainer = Trainer(
        model=lora_model,
        args=TrainingArguments(
            output_dir="./LoAR_data2/",
            learning_rate=2e-5,
            per_device_train_batch_size=BATCH_SIZE,
            save_strategy="epoch",
            num_train_epochs=EPOCHS,
            weight_decay=0.01,
            logging_steps=10,
            report_to="none"
        ),
        train_dataset=train_set,
        tokenizer=tokenizer,
        # mlm=False copies input_ids into labels (with padding masked out), so the Trainer has a loss to minimize;
        # DataCollatorWithPadding alone yields no labels for a causal LM. The prompt-masking collator sketched
        # above is an alternative (with remove_unused_columns=False).
        data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    #     compute_metrics=compute_metrics
    )
    trainer.train()
    
  • Original article: https://blog.csdn.net/weixin_38226321/article/details/136290742