from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
# Load dataset
data = load_dataset("json", data_files="expanded_dataset_570.json", split='train')
data = data.train_test_split(test_size=0.1, seed=42)
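# Optional sanity check: confirm the split sizes and that the expected
# "instruction"/"input"/"output" columns are present (the formatting step
# below assumes these field names)
print(data)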
# Load model/tokenizer
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
# Tokenize prompts and targets. Padding is left to the data collator,
# which pads dynamically per batch and fills label padding with -100
# so padded positions are ignored by the loss.
def format_example(example):
    prompt = f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n"
    inputs = tokenizer(prompt, truncation=True, max_length=512)
    targets = tokenizer(example["output"], truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_data = data.map(format_example, remove_columns=data["train"].column_names)
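# Quick spot check: decode the first tokenized training prompt to confirm
# the template rendered as intended
print(tokenizer.decode(tokenized_data["train"][0]["input_ids"], skip_special_tokens=True))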
# Training config
args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    learning_rate=2e-5,
    num_train_epochs=5,
    warmup_steps=50,
    lr_scheduler_type="linear",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    save_total_limit=2,
    report_to="none",
)
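# DataCollatorForSeq2Seq pads each batch to its longest sequence and uses
# -100 as the label pad id, so loss is not computed on label padding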
collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=collator,
)
trainer.train()
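# Optional smoke test before pushing: generate a reply for one held-out
# example, reusing the same prompt template as training
sample = data["test"][0]
prompt = f"### Instruction:\n{sample['instruction']}\n\n### Input:\n{sample['input']}\n\n### Response:\n"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
print(tokenizer.decode(model.generate(input_ids, max_new_tokens=128)[0], skip_special_tokens=True))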
# Push to Hugging Face Hub
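# Assumes prior authentication (e.g. `huggingface-cli login`) and that
# "your-username" is replaced with your actual Hub namespace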
model.push_to_hub("your-username/flan-t5-chatbot")
tokenizer.push_to_hub("your-username/flan-t5-chatbot")