ssenguptaopteamix committed on
Commit
9e6d02d
·
verified ·
1 Parent(s): 2fb76ec

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +61 -0
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
2
+ from datasets import load_dataset
3
+
4
# The tokenizer and the model weights are both loaded from the same Hub checkpoint.
_CHECKPOINT = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(_CHECKPOINT)

# Use the end-of-sequence token as the padding token for this tokenizer.
tokenizer.pad_token = tokenizer.eos_token

# Keep only the first 100 rows of the "train_sft" split.
dataset = load_dataset("HuggingFaceH4/ultrachat_200k")["train_sft"].select(range(100))
11
+
12
def tokenize_function(examples):
    """Tokenize the ``"prompt"`` column of a batch of examples.

    Args:
        examples: Batch mapping supplied by ``dataset.map(..., batched=True)``;
            only its ``"prompt"`` entry is read.

    Returns:
        The object returned by the module-level ``tokenizer`` for those
        prompts, truncated to the tokenizer's maximum length.
    """
    # Padding is intentionally not applied here: padding every prompt to the
    # tokenizer's maximum length stores and trains on mostly pad tokens. The
    # DataCollatorForLanguageModeling used by this script pads each batch to
    # its longest sequence instead.
    return tokenizer(examples["prompt"], truncation=True)
14
+
15
# Run tokenize_function over the dataset, one batch of rows per call.
td = dataset.map(tokenize_function, batched=True)

# mlm=False selects causal (non-masked) language-model collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Three epochs, four examples per device per step; outputs and logs are
# written under ./output and ./logs respectively.
training_args = TrainingArguments(
    num_train_epochs=3,
    per_device_train_batch_size=4,
    output_dir="./output",
    logging_dir="./logs",
)
25
+
26
+
27
# Fine-tune the model on the tokenized dataset using the arguments and
# collator configured above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=td,
)

trainer.train()

# Persist the fine-tuned weights. The Trainer was not given the tokenizer, so
# save it explicitly to the same directory; this keeps the pad-token setting
# with the checkpoint and lets the directory be reloaded on its own.
trainer.save_model("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")