Qwen2-1.5B-Instruct LoRA Fine-Tuning

十分钟ll · 2024-08-19 09:01:05


Contents:
1. Model download
2. Preliminaries (experts can skip this)
3. Getting down to business (imports)
4. Loading the data
5. Data preprocessing
6. Creating the model
7. Configuring the training arguments
8. Creating the trainer
9. Start training!!!
10. The complete .py script
11. Merging the LoRA adapter and running inference

I recently worked on a competition based on the Qwen2-1.5B-Instruct model, so I'm recording my fine-tuning process here. To make sure I don't forget it later, I've written it down step by step.

Most of this is written for beginners. If that's you, I suggest running it in Jupyter, executing the blocks below one at a time. If you're experienced and just want a training script, go straight to section 10, where the complete code is provided.

1. Model download

For models in general, try searching ModelScope first; the download speed really is fast.

import torch
from modelscope import snapshot_download, AutoModel, AutoTokenizer
import os

# First argument: the model ID to download; second: the cache directory to store it in;
# third: the revision, which defaults to master.
model_dir = snapshot_download('Qwen/Qwen2-1.5B-Instruct', cache_dir='./', revision='master')

2. Preliminaries (experts can skip this)

The real work in fine-tuning is the data processing; the rest is mostly just dropping pieces into a standard skeleton.

Below is the official inference example. We'll use it to see what the data fed into the model looks like.

from modelscope import AutoModelForCausalLM, AutoTokenizer

device = "cuda"  # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained(
    "./Qwen2-1.5B-Instruct",
    torch_dtype="auto",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("./Qwen2-1.5B-Instruct")

prompt = "你好"
messages = [
    {"role": "system", "content": '你是医疗问答助手章鱼哥,你将帮助用户解答基础的医疗问题。'},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

You can print the text variable to see what the encoded input looks like:

'<|im_start|>system\n你是医疗问答助手章鱼哥,你将帮助用户解答基础的医疗问题。<|im_end|>\n<|im_start|>user\n你好<|im_end|>\n<|im_start|>assistant\n'

Note that apply_chat_template does not give you the attention mask and related fields when it encodes (very different from Zhipu's GLM apply_chat_template, which had me stuck for half a day here), so we can't use this template directly when preparing the training data.
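To make the difference concrete, here is a quick check (a small sketch; behaviour as of recent transformers versions): apply_chat_template with tokenize=True returns only a flat list of token ids, while calling the tokenizer directly returns the dict with input_ids and attention_mask that training needs.

# tokenize=True returns just a list of token ids -- no attention_mask, no labels
ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
print(type(ids))   # <class 'list'>

# calling the tokenizer directly gives a dict with everything training needs
enc = tokenizer("你好", add_special_tokens=False)
print(enc.keys())  # dict_keys(['input_ids', 'attention_mask'])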

3. Getting down to business (imports)

from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

4. Loading the data

I'm using a medical Q&A CSV here. If you've followed along this far, the data loading itself shouldn't need much explanation.

dataset = load_dataset("csv", data_files="./问答.csv", split="train")
dataset = dataset.filter(lambda x: x["answer"] is not None)  # drop rows with a missing answer
datasets = dataset.train_test_split(test_size=0.1)
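For reference, the script expects the CSV to have three columns: id, question and answer. A minimal sketch that writes a file in that layout (the two toy rows are made up, not the competition data):

import pandas as pd

toy = pd.DataFrame({
    "id": [1, 2],
    "question": ["嗓子疼怎么办?", "感冒需要吃抗生素吗?"],
    "answer": ["多喝温水,注意休息,症状加重及时就医。", "普通病毒性感冒一般不需要抗生素。"],
})
toy.to_csv("./问答.csv", index=False)  # same filename load_dataset reads above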

5. Data preprocessing

tokenizer = AutoTokenizer.from_pretrained("./Qwen2-1.5B-Instruct", trust_remote_code=True)

def process_func(example):
    MAX_LENGTH = 768  # maximum sequence length; adjust to your GPU memory and data
    input_ids, attention_mask, labels = [], [], []
    instruction = example["question"].strip()  # query
    # instruction = tokenizer.apply_chat_template([{"role": "user", "content": instruction}],
    #                                             add_generation_prompt=True,
    #                                             tokenize=True,
    #                                             )  # '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nquery<|im_end|>\n<|im_start|>assistant\n'
    instruction = tokenizer(
        f"<|im_start|>system\n你是医学领域的人工助手章鱼哥<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['answer']}", add_special_tokens=False)  # the raw answer carries no eos token, so one is appended below
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # prompt tokens are masked with -100 so only the answer contributes to the loss
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

tokenized_ds = datasets['train'].map(process_func, remove_columns=['id', 'question', 'answer'])
tokenized_ts = datasets['test'].map(process_func, remove_columns=['id', 'question', 'answer'])
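As a sanity check (a quick look, not part of the training pipeline), you can decode one processed sample to confirm the template was assembled correctly and that only the answer tokens carry labels:

sample = tokenized_ds[0]
print(tokenizer.decode(sample["input_ids"]))                         # full prompt + answer
print(tokenizer.decode([t for t in sample["labels"] if t != -100]))  # answer only; prompt tokens are masked with -100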

6. Creating the model

import torch
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

model = AutoModelForCausalLM.from_pretrained("./Qwen2-1.5B-Instruct", trust_remote_code=True)

# LoRA configuration: add adapters to all attention and MLP projection layers,
# and keep post_attention_layernorm fully trainable as well
config = LoraConfig(target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
                    modules_to_save=["post_attention_layernorm"])
model = get_peft_model(model, config)  # wrap the base model with the LoRA adapters
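To see how small the trainable portion is, you can print the parameter counts (the output shown in the comment is only illustrative):

model.print_trainable_parameters()
# prints something like: trainable params: ... || all params: ... || trainable%: ...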

7. Configuring the training arguments

args = TrainingArguments(
    output_dir="./chatbot",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,   # effective batch size = 4 * 8 = 32
    gradient_checkpointing=True,
    logging_steps=300,
    num_train_epochs=10,
    learning_rate=1e-4,
    remove_unused_columns=False,
    save_strategy="epoch"
)
# If you enable gradient checkpointing (gradient_checkpointing=True), you must also call
# model.enable_input_require_grads(), otherwise you'll hit a very unpleasant error.
model.enable_input_require_grads()

8. Creating the trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds.select(range(5000)),  # the dataset is large, so I only train on 5,000 rows to keep things fast
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
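One caveat: .select(range(5000)) takes the first 5,000 rows, not a random sample. If you want the subset to actually be random, shuffle first (a small variation; the seed value is arbitrary):

train_subset = tokenized_ds.shuffle(seed=42).select(range(5000))
# then pass train_dataset=train_subset to the Trainer instead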

9. Start training!!!

Good luck!

trainer.train()
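The Trainer writes a checkpoint into output_dir at the end of every epoch (save_strategy="epoch"). If you also want a single, explicitly named copy of the final adapter, you can save it yourself (a sketch; the folder name is my own choice):

trainer.save_model("./chatbot/final_adapter")   # saves the LoRA adapter weights
tokenizer.save_pretrained("./chatbot/final_adapter")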

10. The complete .py script

import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, TaskType, get_peft_model, PeftModel

dataset = load_dataset("csv", data_files="./问答.csv", split="train")
dataset = dataset.filter(lambda x: x["answer"] is not None)
datasets = dataset.train_test_split(test_size=0.1)

tokenizer = AutoTokenizer.from_pretrained("./Qwen2-1.5B-Instruct", trust_remote_code=True)

def process_func(example):
    MAX_LENGTH = 768
    input_ids, attention_mask, labels = [], [], []
    instruction = example["question"].strip()  # query
    instruction = tokenizer(
        f"<|im_start|>system\n你是医学领域的人工助手章鱼哥<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['answer']}", add_special_tokens=False)  # the raw answer carries no eos token, so one is appended below
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

tokenized_ds = datasets['train'].map(process_func, remove_columns=['id', 'question', 'answer'])
tokenized_ts = datasets['test'].map(process_func, remove_columns=['id', 'question', 'answer'])

model = AutoModelForCausalLM.from_pretrained("./Qwen2-1.5B-Instruct", trust_remote_code=True)
config = LoraConfig(target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
                    modules_to_save=["post_attention_layernorm"])
model = get_peft_model(model, config)

args = TrainingArguments(
    output_dir="./chatbot",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    logging_steps=300,
    num_train_epochs=10,
    learning_rate=1e-4,
    remove_unused_columns=False,
    save_strategy="epoch"
)
model.enable_input_require_grads()

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds.select(range(5000)),
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)

trainer.train()

11. Merging the LoRA adapter and running inference

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

def predict(messages, model, tokenizer):
    device = "cuda"
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return response

# Load the tokenizer and base model from the original download path
tokenizer = AutoTokenizer.from_pretrained("./Qwen2-1.5B-Instruct/", use_fast=False, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("./Qwen2-1.5B-Instruct/", device_map="auto", torch_dtype=torch.bfloat16)

# Load the trained LoRA adapter; replace checkpoint-XXXX below with the name of your actual checkpoint folder
model = PeftModel.from_pretrained(model, model_id="./chatbot/checkpoint-1560")

test_texts = {
    'instruction': "你是医学领域的人工助手章鱼哥",
    'input': "嗓子疼,是不是得了流感了"
}

instruction = test_texts['instruction']
input_value = test_texts['input']

messages = [
    {"role": "system", "content": f"{instruction}"},
    {"role": "user", "content": f"{input_value}"}
]

response = predict(messages, model, tokenizer)
print(response)
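Loading the adapter with PeftModel as above is enough for inference. If you'd rather have a standalone model with the LoRA weights folded in, so you no longer depend on the checkpoint folder, here is a short sketch (the output path is my own choice):

merged_model = model.merge_and_unload()                      # fold the LoRA weights into the base model
merged_model.save_pretrained("./Qwen2-1.5B-Instruct-merged")
tokenizer.save_pretrained("./Qwen2-1.5B-Instruct-merged")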


