from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

# Load the GPT-2 tokenizer and model; GPT-2 has no pad token, so reuse EOS.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Use a tiny slice of UltraChat (5 examples) for a quick demonstration run.
dataset = load_dataset("HuggingFaceH4/ultrachat_200k")
dataset = dataset["train_sft"].select(range(5))


def tokenize_function(examples):
    # Tokenize the prompt text, padding and truncating to the model's max length.
    return tokenizer(examples["prompt"], padding="max_length", truncation=True)


td = dataset.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
)

# Causal LM collator: with mlm=False the labels are a copy of input_ids.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

"""
Alternative (currently disabled): configure dataloader behaviour explicitly via
Accelerate. Requires `from accelerate import Accelerator, DataLoaderConfiguration`.
Note that `accelerator.prepare()` is not a context manager; the Trainer handles
Accelerate preparation internally, so the Accelerator only needs to be created.

dataloader_config = DataLoaderConfiguration(
    dispatch_batches=None,
    split_batches=False,
    even_batches=True,
    use_seedable_sampler=True,
)
accelerator = Accelerator(dataloader_config=dataloader_config)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=td,
)
trainer.train()
trainer.save_model("fine_tuned_gpt2")
"""

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=td,
)
trainer.train()
trainer.save_model("fine_tuned_gpt2")
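
# A minimal sanity-check sketch, not part of the original script: reload the
# model saved above and generate a continuation with the in-memory tokenizer.
# The prompt string and generation settings are illustrative assumptions.
fine_tuned = AutoModelForCausalLM.from_pretrained("fine_tuned_gpt2")
inputs = tokenizer("How do I start learning to paint?", return_tensors="pt")
outputs = fine_tuned.generate(
    **inputs,
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))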