acecalisto3 committed
Commit ecc3973 · verified · 1 Parent(s): 98cf6a3

Update app.py

Files changed (1)
  1. app.py +116 -49
app.py CHANGED
@@ -14,44 +14,84 @@ import mimetypes
 from tqdm import tqdm
 import logging
 import gradio as gr
+import requests
+from bs4 import BeautifulSoup

 # Setup logging
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

 # --- URL and File Processing Functions ---
-def fetch_content(url):
-    """Fetch content from a given URL."""
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        return response.text
-    except requests.RequestException as e:
-        logger.error(f"Error fetching {url}: {e}")
-        return None
+def fetch_content(url, retries=3):
+    """Fetch content from a given URL with retries on failure.
+
+    Args:
+        url (str): The URL to fetch content from.
+        retries (int): Number of retries in case of failure.
+
+    Returns:
+        str: The HTML content of the page, or None if an error occurred.
+    """
+    for attempt in range(retries):
+        try:
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+            logger.info(f"Successfully fetched content from {url}")
+            return response.text
+        except requests.RequestException as e:
+            logger.error(f"Error fetching {url} (attempt {attempt + 1}/{retries}): {e}")
+            if attempt == retries - 1:
+                return None

 def extract_text(html):
-    """Extract text from HTML content."""
+    """Extract text from HTML content, removing scripts and styles.
+
+    Args:
+        html (str): The HTML content to extract text from.
+
+    Returns:
+        str: The extracted text, or an empty string if the input is invalid.
+    """
+    if not html:
+        logger.warning("Empty HTML content provided for extraction.")
+        return ""
+
     soup = BeautifulSoup(html, 'html.parser')
+
+    # Remove script and style elements
     for script in soup(["script", "style"]):
         script.decompose()
+
+    # Get text and clean it up
     text = soup.get_text()
     lines = (line.strip() for line in text.splitlines())
     chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-    return '\n'.join(chunk for chunk in chunks if chunk)
-
+
+    # Join non-empty chunks
+    extracted_text = '\n'.join(chunk for chunk in chunks if chunk)
+
+    logger.info("Text extraction completed.")
+    return extracted_text
 def process_urls(urls):
     """Process a list of URLs and return their extracted text."""
     dataset = []
     for url in tqdm(urls, desc="Fetching URLs"):
+        if not url.startswith("http://") and not url.startswith("https://"):
+            logger.warning(f"Invalid URL format: {url}")
+            continue  # Skip invalid URLs
         html = fetch_content(url)
         if html:
             text = extract_text(html)
-            dataset.append({
-                "source": "url",
-                "url": url,
-                "content": text
-            })
+            if text:  # Check if text was extracted
+                dataset.append({
+                    "source": "url",
+                    "url": url,
+                    "content": text
+                })
+            else:
+                logger.warning(f"No text extracted from {url}")
+        else:
+            logger.error(f"Failed to fetch content from {url}")
         time.sleep(1)  # Be polite to the server
     return dataset

@@ -70,12 +110,16 @@ def process_file(file):
            if mime_type and mime_type.startswith('text'):
                with open(filepath, 'r', errors='ignore') as f:
                    content = f.read()
-                   dataset.append({
-                       "source": "file",
-                       "filename": filename,
-                       "content": content
-                   })
+                   if content.strip():  # Check if content is not empty
+                       dataset.append({
+                           "source": "file",
+                           "filename": filename,
+                           "content": content
+                       })
+                   else:
+                       logger.warning(f"File {filename} is empty.")
            else:
+               logger.warning(f"File {filename} is not a text file.")
                dataset.append({
                    "source": "file",
                    "filename": filename,
@@ -85,12 +129,16 @@ def process_file(file):
        mime_type, _ = mimetypes.guess_type(file.name)
        if mime_type and mime_type.startswith('text'):
            content = file.read().decode('utf-8', errors='ignore')
-           dataset.append({
-               "source": "file",
-               "filename": os.path.basename(file.name),
-               "content": content
-           })
+           if content.strip():  # Check if content is not empty
+               dataset.append({
+                   "source": "file",
+                   "filename": os.path.basename(file.name),
+                   "content": content
+               })
+           else:
+               logger.warning(f"Uploaded file {file.name} is empty.")
        else:
+           logger.warning(f"Uploaded file {file.name} is not a text file.")
            dataset.append({
                "source": "file",
                "filename": os.path.basename(file.name),
@@ -98,13 +146,6 @@ def process_file(file):
            })
    return dataset

-def process_text(text):
-    """Process raw text input."""
-    return [{
-        "source": "text_input",
-        "content": text
-    }]
-
 def create_dataset(urls, file, text_input):
     """Create a combined dataset from URLs, uploaded files, and text input."""
     dataset = []
@@ -115,12 +156,16 @@ def create_dataset(urls, file, text_input):
     if text_input:
         dataset.extend(process_text(text_input))

+    # Log the contents of the dataset
+    logger.info(f"Dataset created with {len(dataset)} entries.")
+    for entry in dataset:
+        logger.debug(f"Entry: {entry}")
+
     output_file = 'combined_dataset.json'
     with open(output_file, 'w') as f:
         json.dump(dataset, f, indent=2)

     return output_file
-
 # --- Model Training and Evaluation Functions ---
 class CustomDataset(torch.utils.data.Dataset):
     def __init__(self, data, tokenizer, max_length=512):
@@ -162,6 +207,12 @@ def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_le
     model.to(device)

     dataset = CustomDataset(data, tokenizer, max_length=max_length)
+
+    # Check if dataset is empty
+    if len(dataset) == 0:
+        logger.error("The dataset is empty. Please check the input data.")
+        return None, None
+
     train_size = int(0.8 * len(dataset))
     val_size = len(dataset) - train_size
     train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
@@ -171,7 +222,8 @@ def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_le
         num_train_epochs=epochs,
         per_device_train_batch_size=batch_size,
         per_device_eval_batch_size=batch_size,
-        evaluation_strategy='epoch',
+        evaluation_strategy='epoch',  # Set evaluation strategy
+        save_strategy='epoch',  # Ensure save strategy matches evaluation strategy
         learning_rate=learning_rate,
         save_steps=500,
         load_best_model_at_end=True,
@@ -248,30 +300,45 @@ def deploy_model(model, tokenizer):

 # Gradio Interface
 def gradio_interface(urls, file, text_input, model_name, batch_size, epochs):
-    dataset_file = create_dataset(urls, file, text_input)
-
-    with open(dataset_file, 'r') as f:
-        dataset = json.load(f)
+    try:
+        # Create the dataset from the provided inputs
+        dataset_file = create_dataset(urls, file, text_input)
+
+        # Load the dataset
+        with open(dataset_file, 'r') as f:
+            dataset = json.load(f)
+
+        # Check if the dataset is empty
+        if not dataset:
+            return "Error: The dataset is empty. Please check your inputs."

-    model, tokenizer = train_model(model_name, dataset, batch_size, epochs)
+        # Train the model
+        model, tokenizer = train_model(model_name, dataset, batch_size, epochs)

-    deploy_model(model, tokenizer)
+        # Deploy the model
+        deploy_model(model, tokenizer)

-    return dataset_file
+        return dataset_file
+
+    except Exception as e:
+        logger.error(f"Error in gradio_interface: {e}")
+        return f"An error occurred: {str(e)}"

+# Gradio Interface Setup
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.Textbox(lines=5, label="Enter comma-separated URLs"),
+        gr.Textbox(lines=5, label="Enter comma-separated URLs", placeholder="http://example.com, https://example.org"),
         gr.File(label="Upload file (including zip files)", type="filepath"),
-        gr.Textbox(lines=10, label="Enter or paste large text"),
+        gr.Textbox(lines=10, label="Enter or paste large text", placeholder="Your text here..."),
         gr.Textbox(label="Model name", value="distilbert-base-uncased"),
-        gr.Number(label="Batch size", value=8),
-        gr.Number(label="Epochs", value=3),
+        gr.Number(label="Batch size", value=8, precision=0, step=1),
+        gr.Number(label="Epochs", value=3, precision=0, step=1),
     ],
-    outputs=gr.File(label="Download Combined Dataset"),
+    outputs=gr.File(label="Download Combined Dataset"),
     title="Dataset Creation and Model Training",
     description="Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.",
+    theme="default",  # You can change the theme if desired
 )

  # Launch the interface
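
Not part of the commit: a minimal usage sketch of the reworked URL pipeline. The module name app and the example URL are illustrative assumptions, not taken from this repository.

# Hypothetical usage sketch -- the module name `app` and the example URL are
# assumptions, not part of this commit.
from app import fetch_content, extract_text

html = fetch_content("https://example.com", retries=3)  # `retries` is the parameter this commit adds
if html:
    text = extract_text(html)  # returns "" for empty input after this change
    print(text[:200])          # first 200 characters of the cleaned text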
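
Also not part of the commit: the record shape that create_dataset writes to combined_dataset.json, as implied by the dataset.append calls in the diff. Field values below are placeholders.

# Illustrative records only -- values are placeholders.
example_records = [
    {"source": "url", "url": "https://example.com", "content": "page text..."},
    {"source": "file", "filename": "notes.txt", "content": "file text..."},
]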
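
Finally, a minimal sketch of why the added save_strategy matters: with load_best_model_at_end=True, transformers requires the save and evaluation strategies to match and otherwise raises a ValueError when TrainingArguments is constructed, so pairing save_strategy='epoch' with evaluation_strategy='epoch' avoids that failure. The output_dir below is an assumption, not shown in the diff.

# Sketch only -- output_dir is assumed; the other values mirror the diff.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",        # assumed path, not shown in this commit
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",         # must match evaluation_strategy
    load_best_model_at_end=True,
)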