train.py
"""Fine-tune a pretrained model on a dataset."""
import datetime
import os
import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import AutoTokenizer, LlamaForCausalLM, TrainingArguments
from trl import SFTTrainer
from zoneinfo import ZoneInfo
from config import FinetuneConfig
from utils import setup_mlflow_tracking
class LlmTrainer:
    """Fine-tune a causal language model with LoRA on an instruction dataset."""

    def __init__(self, model_name: str, dataset_name: str):
        """Initialize the LlmTrainer class."""
        self.config = FinetuneConfig()
        self.model_name = model_name
        self.dataset_name = dataset_name
        self.model = LlamaForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.trainer = None

    def load_preprocess_dataset(self, dataset_name: str, percentage: float = 1.0):
        """Load and preprocess the dataset."""
        dataset = load_dataset(dataset_name, split="train")
        print(f"Loading original dataset: {len(dataset)} samples")

        # Select a subset of the dataset based on the requested percentage
        sample_size = int(len(dataset) * percentage)
        self.dataset = dataset.select(range(sample_size))
        print(f"Loaded dataset: {len(self.dataset)} samples")

        print("\tPreprocessing the dataset...")

        def merge_sample(example):
            """Merge the instruction and response into a single conversation."""
            example["conversation"] = f"""<|begin_of_text|><|start_header_id|>You are a question/answer assistant. Your answer must begin with a capital letter and end with a full stop.<|end_header_id|>
Customer: {example['instruction']}\nAssistant: {example['response']}"""  # noqa: E501
            return example

        self.dataset = self.dataset.map(merge_sample)
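
    # For a sample with instruction "How do I cancel my order?" and response
    # "You can cancel it from the Orders page.", the merged "conversation" field
    # would look like the following (the values are illustrative, not taken from
    # the real dataset):
    #
    #     <|begin_of_text|><|start_header_id|>You are a question/answer assistant. ...<|end_header_id|>
    #     Customer: How do I cancel my order?
    #     Assistant: You can cancel it from the Orders page.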

    def configure_trainer(self, experiment_name: str):
        """Configure the training arguments and LoRA settings."""
        self.training_args = TrainingArguments(
            max_steps=self.config.max_steps,
            per_device_train_batch_size=self.config.per_device_train_batch_size,
            learning_rate=self.config.learning_rate,
            gradient_accumulation_steps=self.config.gradient_accumulation_steps,
            warmup_steps=self.config.warmup_steps,
            optim=self.config.optim,
            seed=self.config.seed,
            save_strategy=self.config.save_strategy,
            output_dir=f"output/{experiment_name}",
            logging_steps=self.config.logging_steps,
            gradient_checkpointing=self.config.gradient_checkpointing,
            report_to=["mlflow"],
        )

        # LoRA configuration for model adaptation
        self.lora_config = LoraConfig(r=self.config.r)
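
    # FinetuneConfig is imported from config.py, which is not part of this file.
    # A minimal sketch of the fields this trainer reads, assuming a simple
    # dataclass; the field names come from the attribute accesses above, but the
    # defaults shown are illustrative guesses, not the project's actual values:
    #
    #     @dataclass
    #     class FinetuneConfig:
    #         max_steps: int = 100
    #         per_device_train_batch_size: int = 2
    #         learning_rate: float = 2e-4
    #         gradient_accumulation_steps: int = 4
    #         warmup_steps: int = 10
    #         optim: str = "adamw_torch"
    #         seed: int = 42
    #         save_strategy: str = "no"
    #         logging_steps: int = 10
    #         gradient_checkpointing: bool = True
    #         r: int = 8  # LoRA rank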

    def train_model(self, experiment_name: str = "llama_experiment"):
        """Train the model within an mlflow experiment."""
        # Initialize the trainer with configurations
        self.trainer = SFTTrainer(
            model=self.model,
            tokenizer=self.tokenizer,
            args=self.training_args,
            peft_config=self.lora_config,
            train_dataset=self.dataset,
            dataset_text_field="conversation",
        )

        print("Starting model fine-tuning...")
        with setup_mlflow_tracking(self.model_name):
            self.trainer.train()
        print("Training completed.")

    def save_model(self, experiment_name: str):
        """Save the trained model to output/<experiment_name>."""
        output_dir = "output/"
        os.makedirs(output_dir, exist_ok=True)
        save_path = os.path.join(output_dir, experiment_name)

        # Unwrap DataParallel/DistributedDataParallel before saving, if needed
        model_to_save = self.trainer.model.module if hasattr(self.trainer.model, "module") else self.trainer.model
        model_to_save.save_pretrained(save_path)
        print(f"Model saved to {save_path}")

    def train_pipeline(self):
        """Run the full pipeline: load data, configure, train, and save."""
        model_short_name = self.model_name.split("/")[-1]
        timestamp = datetime.datetime.now(tz=ZoneInfo("UTC")).strftime("%Y%m%d_%H%M")
        experiment_name = f"{model_short_name}_finetune_{timestamp}"

        self.load_preprocess_dataset(dataset_name=self.dataset_name)
        self.configure_trainer(experiment_name=experiment_name)
        self.train_model(experiment_name=experiment_name)
        self.save_model(experiment_name=experiment_name)


if __name__ == "__main__":
    # Main entry point for the script.
    model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"
    LlmTrainer(model_name=model_name, dataset_name=dataset_name).train_pipeline()
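
# After training, the LoRA adapter saved by save_model() can be loaded back onto
# the base model for inference. A minimal sketch, assuming the adapter was saved
# under output/<experiment_name> and using peft's PeftModel API; the experiment
# name and prompt below are placeholders, not values from a real run:
#
#     import torch
#     from peft import PeftModel
#     from transformers import AutoTokenizer, LlamaForCausalLM
#
#     base = LlamaForCausalLM.from_pretrained(
#         "meta-llama/Meta-Llama-3.1-8B-Instruct", torch_dtype=torch.bfloat16
#     )
#     model = PeftModel.from_pretrained(base, "output/<experiment_name>")
#     tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
#     inputs = tokenizer("Customer: How do I cancel my order?\nAssistant:", return_tensors="pt")
#     print(tokenizer.decode(model.generate(**inputs, max_new_tokens=64)[0]))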