from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Load dataset and split off a 10% evaluation set
data = load_dataset("json", data_files="expanded_dataset_570.json", split="train")
data = data.train_test_split(test_size=0.1, seed=42)

# Load model/tokenizer
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Build the prompt and tokenize inputs/targets
def format_example(example):
    prompt = (
        f"### Instruction:\n{example['instruction']}\n\n"
        f"### Input:\n{example['input']}\n\n"
        f"### Response:\n"
    )
    # No static padding here: DataCollatorForSeq2Seq pads each batch dynamically
    # and pads labels with -100 so padding is ignored by the loss.
    inputs = tokenizer(prompt, truncation=True, max_length=512)
    targets = tokenizer(text_target=example["output"], truncation=True, max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_data = data.map(format_example, remove_columns=data["train"].column_names)

# Training config
args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,
    learning_rate=2e-5,
    num_train_epochs=5,
    warmup_steps=50,
    lr_scheduler_type="linear",
    logging_steps=10,
    evaluation_strategy="steps",  # newer transformers releases rename this to eval_strategy
    eval_steps=100,
    save_total_limit=2,
    report_to="none",
)

collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    data_collator=collator,
)

trainer.train()

# Push to Hugging Face Hub
model.push_to_hub("your-username/flan-t5-chatbot")
tokenizer.push_to_hub("your-username/flan-t5-chatbot")
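
# Optional sanity check (a minimal sketch): reload the fine-tuned model from the Hub
# and generate one response. The repo id mirrors the placeholder used in push_to_hub
# above, and the example instruction/input are hypothetical; the prompt must follow
# the same "### Instruction / ### Input / ### Response" format used during training.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

repo_id = "your-username/flan-t5-chatbot"  # placeholder repo id from the training script
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSeq2SeqLM.from_pretrained(repo_id)

prompt = (
    "### Instruction:\nAnswer the customer's question.\n\n"   # hypothetical instruction
    "### Input:\nWhat are your opening hours?\n\n"            # hypothetical input
    "### Response:\n"
)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))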