acecalisto3 committed on
Commit
def429d
·
verified ·
1 Parent(s): efc2a65

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -15
app.py CHANGED
@@ -1,14 +1,25 @@
1
- import gradio as gr
2
- import requests
3
- from bs4 import BeautifulSoup
4
  import json
5
- import time
6
- import zipfile
7
  import os
 
 
 
 
 
 
 
 
8
  import tempfile
 
9
  import mimetypes
10
  from tqdm import tqdm
 
 
11
 
 
 
 
 
 
12
  def fetch_content(url):
13
  """Fetch content from a given URL."""
14
  try:
@@ -16,7 +27,7 @@ def fetch_content(url):
16
  response.raise_for_status()
17
  return response.text
18
  except requests.RequestException as e:
19
- print(f"Error fetching {url}: {e}")
20
  return None
21
 
22
  def extract_text(html):
@@ -65,7 +76,6 @@ def process_file(file):
65
  "content": content
66
  })
67
  else:
68
- # For non-text files, just store the filename
69
  dataset.append({
70
  "source": "file",
71
  "filename": filename,
@@ -81,7 +91,6 @@ def process_file(file):
81
  "content": content
82
  })
83
  else:
84
- # For non-text files, just store the filename
85
  dataset.append({
86
  "source": "file",
87
  "filename": os.path.basename(file.name),
@@ -106,28 +115,168 @@ def create_dataset(urls, file, text_input):
106
  if text_input:
107
  dataset.extend(process_text(text_input))
108
 
109
- # Save the dataset as JSON
110
  output_file = 'combined_dataset.json'
111
  with open(output_file, 'w') as f:
112
  json.dump(dataset, f, indent=2)
113
 
114
  return output_file
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  # Gradio Interface
117
- def gradio_interface(urls, file, text_input):
118
- return create_dataset(urls, file, text_input)
 
 
 
 
 
 
 
 
 
119
 
120
  iface = gr.Interface(
121
  fn=gradio_interface,
122
  inputs=[
123
  gr.Textbox(lines=5, label="Enter comma-separated URLs"),
124
  gr.File(label="Upload file (including zip files)"),
125
- gr.Textbox(lines=10, label="Enter or paste large text")
 
 
 
126
  ],
127
  outputs=gr.File(label="Download Combined Dataset"),
128
- title="URL, File, and Text to Dataset Converter",
129
- description="Enter URLs, upload files (including zip files), and/or paste text to create a combined dataset for AI training.",
130
  )
131
 
132
  # Launch the interface
133
- iface.launch()
 
 
 
 
 
1
  import json
 
 
2
  import os
3
+ import torch
4
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
5
+ from sklearn.metrics import accuracy_score
6
+ from torch.utils.data import DataLoader
7
+ from transformers import Trainer, TrainingArguments
8
+ import time
9
+ import requests
10
+ from bs4 import BeautifulSoup
11
  import tempfile
12
+ import zipfile
13
  import mimetypes
14
  from tqdm import tqdm
15
+ import logging
16
+ import gradio as gr
17
 
18
+ # Setup logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # --- URL and File Processing Functions ---
23
  def fetch_content(url):
24
  """Fetch content from a given URL."""
25
  try:
 
27
  response.raise_for_status()
28
  return response.text
29
  except requests.RequestException as e:
30
+ logger.error(f"Error fetching {url}: {e}")
31
  return None
32
 
33
  def extract_text(html):
 
76
  "content": content
77
  })
78
  else:
 
79
  dataset.append({
80
  "source": "file",
81
  "filename": filename,
 
91
  "content": content
92
  })
93
  else:
 
94
  dataset.append({
95
  "source": "file",
96
  "filename": os.path.basename(file.name),
 
115
  if text_input:
116
  dataset.extend(process_text(text_input))
117
 
 
118
  output_file = 'combined_dataset.json'
119
  with open(output_file, 'w') as f:
120
  json.dump(dataset, f, indent=2)
121
 
122
  return output_file
123
 
124
+ # --- Model Training and Evaluation Functions ---
125
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one record per access.

    Each record is read with ``record['content']`` for the text and
    ``record.get('label', 0)`` for the class id, so a record without a
    ``'label'`` key is treated as class 0.

    Args:
        data: Indexable collection of dict-like records.
        tokenizer: Object exposing ``encode_plus(text, max_length=...,
            padding=..., truncation=..., return_attention_mask=...,
            return_tensors=...)`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors whose first
            dimension is a batch axis of size 1 (presumably a Hugging Face
            tokenizer -- confirm against the caller).
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and return its model inputs.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (batch axis
            removed) and ``'labels'`` as a scalar ``torch.long`` tensor.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        try:
            record = self.data[idx]
            text = record['content']
            label = record.get('label', 0)

            encoding = self.tokenizer.encode_plus(
                text,
                max_length=self.max_length,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )

            # Drop only the leading batch axis.  A bare ``squeeze()`` would
            # also collapse the sequence axis when max_length == 1, yielding
            # 0-d tensors that no longer line up with the other samples.
            return {
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': torch.tensor(label, dtype=torch.long),
            }
        except Exception as e:
            logger.error(f"Error in processing item {idx}: {e}")
            raise
156
+
157
def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_length=512):
    """Fine-tune a sequence-classification model on ``data`` and save it.

    The records are wrapped in ``CustomDataset``, split 80/20 into train
    and validation subsets, trained with the Hugging Face ``Trainer``,
    evaluated once more after training, and saved to ``./model``.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            model and the tokenizer.
        data: Collection of records consumed by ``CustomDataset``.
        batch_size: Per-device batch size for training and evaluation.
            Coerced to ``int`` because UI number inputs may supply floats.
        epochs: Number of training epochs.
        learning_rate: Optimizer learning rate.
        max_length: Token length each sample is padded/truncated to.

    Returns:
        Tuple ``(model, tokenizer)``.

    Raises:
        ValueError: if ``data`` has fewer than two records, since both the
            training and the validation split need at least one sample.
        Exception: any other failure is logged and re-raised unchanged.
    """
    try:
        if len(data) < 2:
            raise ValueError(
                "At least 2 records are required to build train and validation splits"
            )
        batch_size = int(batch_size)

        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)

        dataset = CustomDataset(data, tokenizer, max_length=max_length)
        train_size = int(0.8 * len(dataset))
        val_size = len(dataset) - train_size
        train_dataset, val_dataset = torch.utils.data.random_split(
            dataset, [train_size, val_size]
        )

        # NOTE(review): newer transformers releases rename
        # ``evaluation_strategy`` to ``eval_strategy`` -- confirm against the
        # version this app is pinned to.
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            evaluation_strategy='epoch',
            # load_best_model_at_end requires the save strategy to match the
            # evaluation strategy; the previous step-based saving
            # (save_steps=500) made TrainingArguments reject the config.
            save_strategy='epoch',
            learning_rate=learning_rate,
            load_best_model_at_end=True,
            metric_for_best_model='accuracy',
            greater_is_better=True,
            save_total_limit=2,
            seed=42,
            dataloader_num_workers=4,
            fp16=torch.cuda.is_available(),
        )

        def compute_accuracy(pred):
            # Predictions are per-class scores; argmax picks the class id.
            return {
                'accuracy': accuracy_score(pred.label_ids, pred.predictions.argmax(-1))
            }

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_accuracy,
        )

        logger.info("Starting model training...")
        start_time = time.time()
        trainer.train()
        end_time = time.time()
        logger.info(f'Training time: {end_time - start_time:.2f} seconds')

        logger.info("Evaluating model...")
        eval_result = trainer.evaluate()
        logger.info(f'Evaluation result: {eval_result}')

        trainer.save_model('./model')

        return model, tokenizer

    except Exception as e:
        logger.error(f"Error during training: {e}")
        raise
213
+
214
def deploy_model(model, tokenizer):
    """Persist the model and tokenizer and write a standalone inference script.

    Saves both objects to ``./model`` via ``save_pretrained`` and writes
    ``./deployment.py``, which reloads them and exposes ``predict(text)``.

    Args:
        model: Object with ``save_pretrained(path)``.
        tokenizer: Object with ``save_pretrained(path)``.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    try:
        model.save_pretrained('./model')
        tokenizer.save_pretrained('./model')

        # Plain (non-f) string: nothing is interpolated.  The generated
        # script moves the model to the same device as its inputs; before,
        # only the inputs were moved, which fails on CUDA hosts.  It also
        # runs in eval mode without gradients and has a __main__ entry so
        # that ``python deployment.py <text>`` prints a prediction.
        deployment_script = '''
import sys

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained('./model')
model.to(device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained('./model')


def predict(text):
    encoding = tokenizer.encode_plus(
        text,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    return torch.argmax(logits, dim=1).cpu().numpy()[0]


if __name__ == '__main__':
    print(predict(' '.join(sys.argv[1:])))
'''

        with open('./deployment.py', 'w', encoding='utf-8') as f:
            f.write(deployment_script)

        logger.info('Model deployed successfully. To use the model, run: python deployment.py')

    except Exception as e:
        logger.error(f"Error deploying model: {e}")
        raise
251
+
252
  # Gradio Interface
253
def gradio_interface(urls, file, text_input, model_name, batch_size, epochs):
    """Build the dataset, train a model on it, deploy it, and return the dataset path.

    Args:
        urls: URL text forwarded to ``create_dataset``.
        file: Uploaded file forwarded to ``create_dataset``.
        text_input: Free text forwarded to ``create_dataset``.
        model_name: Model identifier forwarded to ``train_model``.
        batch_size: Batch size from the UI; coerced to ``int`` because a
            numeric input field may deliver a float such as ``8.0``.
        epochs: Number of training epochs forwarded to ``train_model``.

    Returns:
        Path of the JSON dataset file produced by ``create_dataset``.
    """
    dataset_file = create_dataset(urls, file, text_input)

    # Reload from disk so training sees exactly what was written out.
    with open(dataset_file, 'r') as f:
        dataset = json.load(f)

    model, tokenizer = train_model(model_name, dataset, int(batch_size), epochs)

    deploy_model(model, tokenizer)

    return dataset_file
264
 
265
# Web UI: dataset sources plus training settings in, dataset file out.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(lines=5, label="Enter comma-separated URLs"),
        gr.File(label="Upload file (including zip files)"),
        gr.Textbox(lines=10, label="Enter or paste large text"),
        gr.Textbox(label="Model name", value="distilbert-base-uncased"),
        # precision=0 makes the component deliver an int; the batch size is
        # passed to the trainer as a per-device batch size.
        gr.Number(label="Batch size", value=8, precision=0),
        gr.Number(label="Epochs", value=3),
    ],
    outputs=gr.File(label="Download Combined Dataset"),
    title="Dataset Creation and Model Training",
    description="Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.",
)

# Launch the interface only when run as a script, not on import.
if __name__ == "__main__":
    iface.launch()