luohoa97 committed on
Commit
ca7562b
·
verified ·
1 Parent(s): b9e7099

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from transformers import BertForSequenceClassification, Trainer, TrainingArguments, BertTokenizer
4
+ from datasets import Dataset
5
+ from huggingface_hub import HfApi
6
+
7
class CSVTrainer:
    """Fine-tune a BERT sequence classifier on uploaded CSVs and publish it.

    Every upload must provide the columns listed in ``REQUIRED_COLUMNS``.
    ``Event``, ``Location``, ``Date`` and ``Time`` are joined into one
    sentence per row, and ``Category`` is the class the model learns.
    """

    # Columns read while building the training text and the labels.
    REQUIRED_COLUMNS = ("Event", "Location", "Date", "Time", "Category")

    def __init__(self):
        # Not read by any method of this class; kept so existing callers
        # that touch the attribute keep working.
        self.csv_files = []
        # Local directory the fine-tuned weights are written to before upload.
        self.model_dir = "./Personal"
        # Hub repository that receives the trained model.
        self.repo_id = "luohoa97/PersonalBot-o"

    def upload_and_train(self, csv_files):
        """Train on the uploaded CSV files and push the result to the Hub.

        Args:
            csv_files: One upload or a list of uploads. Each item may be a
                path string or an object exposing the path as ``.name``.

        Returns:
            A human-readable status message. Input problems (nothing
            uploaded, missing columns, fewer than two categories, or any
            other ``ValueError`` raised while reading the files) are
            reported as a message instead of being raised.
        """
        if not csv_files:
            return "Please upload at least one CSV file."

        try:
            examples, categories = self._prepare_examples(csv_files)
        except ValueError as err:
            return str(err)

        dataset = Dataset.from_pandas(examples[['text', 'labels']])
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        def tokenize_function(batch):
            return tokenizer(batch['text'], padding='max_length', truncation=True)

        tokenized_datasets = dataset.map(tokenize_function, batched=True)
        tokenized_datasets.set_format(
            "torch", columns=['input_ids', 'attention_mask', 'labels']
        )

        # Store the category names in the model config so predictions can be
        # mapped back from class indices to the original labels.
        id2label = {index: str(name) for index, name in enumerate(categories)}
        label2id = {name: index for index, name in id2label.items()}
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=len(id2label),
            id2label=id2label,
            label2id=label2id,
        )

        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=64,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            evaluation_strategy="epoch",
            save_strategy="epoch"
        )

        # NOTE: evaluation runs on the training data itself; no hold-out
        # split is made, so the reported metrics are not a generalization
        # estimate.
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets,
            eval_dataset=tokenized_datasets
        )

        trainer.train()
        model.save_pretrained(self.model_dir)
        self.upload_to_huggingface()

        return "Model uploaded to Hugging Face successfully!"

    def _prepare_examples(self, csv_files):
        """Load the uploads into one frame with ``text`` and ``labels`` columns.

        Returns:
            ``(frame, categories)`` where ``categories[i]`` is the original
            ``Category`` value encoded as label ``i``.

        Raises:
            ValueError: If a required column is missing or fewer than two
                distinct categories remain after dropping unlabeled rows.
        """
        if not isinstance(csv_files, (list, tuple)):
            csv_files = [csv_files]

        # Accept both path strings and upload objects carrying ``.name``.
        frames = [pd.read_csv(getattr(item, "name", item)) for item in csv_files]
        combined = pd.concat(frames, ignore_index=True)

        missing = [col for col in self.REQUIRED_COLUMNS if col not in combined.columns]
        if missing:
            raise ValueError(f"Missing required column(s): {', '.join(missing)}")

        # pd.factorize encodes missing values as -1, which is not a valid
        # class index, so rows without a category are dropped first.
        combined = combined.dropna(subset=["Category"]).reset_index(drop=True)
        codes, categories = pd.factorize(combined["Category"])
        if len(categories) < 2:
            raise ValueError(
                "At least two distinct Category values are required to train a classifier."
            )

        combined['text'] = combined.apply(
            lambda row: f"{row['Event']} in {row['Location']} on {row['Date']}, {row['Time']}",
            axis=1,
        )
        combined['labels'] = codes
        return combined, categories

    def upload_to_huggingface(self):
        """Push the model saved in ``self.model_dir`` to ``self.repo_id``."""
        api = HfApi()
        try:
            # exist_ok=True keeps retraining into an existing repo from being
            # reported as a failure.
            api.create_repo(repo_id=self.repo_id, exist_ok=True)
        except Exception as e:
            # Best effort: report the problem and let push_to_hub surface any
            # real failure (e.g. missing credentials).
            print(f"Repo creation failed: {e}")
        model = BertForSequenceClassification.from_pretrained(self.model_dir)
        model.push_to_hub(self.repo_id)
68
+
69
# Single trainer instance used by every request to the UI.
trainer = CSVTrainer()


def gradio_interface(file):
    """Gradio callback: forward the uploaded file(s) to the trainer.

    Returns the status message produced by ``CSVTrainer.upload_and_train``.
    """
    return trainer.upload_and_train(file)


iface = gr.Interface(
    fn=gradio_interface,
    # The File component selects multi-file upload with file_count; it has
    # no `multiple` keyword, and the legacy `gr.inputs` namespace is not
    # available in current Gradio releases.
    inputs=gr.File(label="Upload CSV Files", file_count="multiple"),
    outputs="text",
    title="CSV Trainer",
    description="Upload CSV files for training a BERT model.",
)

iface.launch()