acecalisto3 committed on
Commit 10bd6bb · verified · 1 Parent(s): c04c74a

Update app.py

Files changed (1)
  1. app.py +17 -73
app.py CHANGED
@@ -14,8 +14,6 @@ import mimetypes
 from tqdm import tqdm
 import logging
 import gradio as gr
-import requests
-from bs4 import BeautifulSoup
 
 # Setup logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -23,15 +21,6 @@ logger = logging.getLogger(__name__)
 
 # --- URL and File Processing Functions ---
 def fetch_content(url, retries=3):
-    """Fetch content from a given URL with retries on failure.
-
-    Args:
-        url (str): The URL to fetch content from.
-        retries (int): Number of retries in case of failure.
-
-    Returns:
-        str: The HTML content of the page, or None if an error occurred.
-    """
     for attempt in range(retries):
         try:
             response = requests.get(url, timeout=10)
@@ -44,65 +33,46 @@ def fetch_content(url, retries=3):
     return None
 
 def extract_text(html):
-    """Extract text from HTML content, removing scripts and styles.
-
-    Args:
-        html (str): The HTML content to extract text from.
-
-    Returns:
-        str: The extracted text, or an empty string if the input is invalid.
-    """
     if not html:
         logger.warning("Empty HTML content provided for extraction.")
         return ""
 
     soup = BeautifulSoup(html, 'html.parser')
-
-    # Remove script and style elements
     for script in soup(["script", "style"]):
         script.decompose()
 
-    # Get text and clean it up
     text = soup.get_text()
     lines = (line.strip() for line in text.splitlines())
     chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-
-    # Join non-empty chunks
     extracted_text = '\n'.join(chunk for chunk in chunks if chunk)
 
     logger.info("Text extraction completed.")
     return extracted_text
+
 def process_urls(urls):
-    """Process a list of URLs and return their extracted text."""
     dataset = []
     for url in tqdm(urls, desc="Fetching URLs"):
         if not url.startswith("http://") and not url.startswith("https://"):
             logger.warning(f"Invalid URL format: {url}")
-            continue # Skip invalid URLs
+            continue
         html = fetch_content(url)
         if html:
             text = extract_text(html)
-            if text: # Check if text was extracted
-                dataset.append({
-                    "source": "url",
-                    "url": url,
-                    "content": text
-                })
+            if text:
+                dataset.append({"source": "url", "url": url, "content": text})
             else:
                 logger.warning(f"No text extracted from {url}")
         else:
             logger.error(f"Failed to fetch content from {url}")
-        time.sleep(1) # Be polite to the server
+        time.sleep(1)
     return dataset
 
 def process_file(file):
-    """Process uploaded files (including zip files) and extract text."""
     dataset = []
     with tempfile.TemporaryDirectory() as temp_dir:
         if zipfile.is_zipfile(file.name):
             with zipfile.ZipFile(file.name, 'r') as zip_ref:
                 zip_ref.extractall(temp_dir)
-            # Process each extracted file
             for root, _, files in os.walk(temp_dir):
                 for filename in files:
                     filepath = os.path.join(root, filename)
@@ -110,62 +80,42 @@ def process_file(file):
                     if mime_type and mime_type.startswith('text'):
                         with open(filepath, 'r', errors='ignore') as f:
                             content = f.read()
-                            if content.strip(): # Check if content is not empty
-                                dataset.append({
-                                    "source": "file",
-                                    "filename": filename,
-                                    "content": content
-                                })
+                            if content.strip():
+                                dataset.append({"source": "file", "filename": filename, "content": content})
                             else:
                                 logger.warning(f"File {filename} is empty.")
                     else:
                         logger.warning(f"File {filename} is not a text file.")
-                        dataset.append({
-                            "source": "file",
-                            "filename": filename,
-                            "content": "Binary file - content not extracted"
-                        })
+                        dataset.append({"source": "file", "filename": filename, "content": "Binary file - content not extracted"})
         else:
             mime_type, _ = mimetypes.guess_type(file.name)
             if mime_type and mime_type.startswith('text'):
                 content = file.read().decode('utf-8', errors='ignore')
-                if content.strip(): # Check if content is not empty
-                    dataset.append({
-                        "source": "file",
-                        "filename": os.path.basename(file.name),
-                        "content": content
-                    })
+                if content.strip():
+                    dataset.append({"source": "file", "filename": os.path.basename(file.name), "content": content})
                 else:
                     logger.warning(f"Uploaded file {file.name} is empty.")
             else:
                 logger.warning(f"Uploaded file {file.name} is not a text file.")
-                dataset.append({
-                    "source": "file",
-                    "filename": os.path.basename(file.name),
-                    "content": "Binary file - content not extracted"
-                })
+                dataset.append({"source": "file", "filename": os.path.basename(file.name), "content": "Binary file - content not extracted"})
     return dataset
 
 def create_dataset(urls, file, text_input):
-    """Create a combined dataset from URLs, uploaded files, and text input."""
     dataset = []
     if urls:
         dataset.extend(process_urls([url.strip() for url in urls.split(',') if url.strip()]))
     if file:
         dataset.extend(process_file(file))
     if text_input:
-        dataset.extend(process_text(text_input))
+        dataset.append({"source": "input", "content": text_input})
 
-    # Log the contents of the dataset
     logger.info(f"Dataset created with {len(dataset)} entries.")
-    for entry in dataset:
-        logger.debug(f"Entry: {entry}")
-
     output_file = 'combined_dataset.json'
     with open(output_file, 'w') as f:
         json.dump(dataset, f, indent=2)
 
     return output_file
+
 # --- Model Training and Evaluation Functions ---
 class CustomDataset(torch.utils.data.Dataset):
     def __init__(self, data, tokenizer, max_length=512):
@@ -178,7 +128,7 @@ class CustomDataset(torch.utils.data.Dataset):
 
     def __getitem__(self, idx):
         try:
-            text = self.data[idx]['content ']
+            text = self.data[idx]['content'] # Fixed the key to 'content'
             label = self.data[idx].get('label', 0)
 
             encoding = self.tokenizer.encode_plus(
@@ -208,7 +158,6 @@ def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_le
 
     dataset = CustomDataset(data, tokenizer, max_length=max_length)
 
-    # Check if dataset is empty
    if len(dataset) == 0:
        logger.error("The dataset is empty. Please check the input data.")
        return None, None
@@ -222,8 +171,8 @@ def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_le
         num_train_epochs=epochs,
         per_device_train_batch_size=batch_size,
         per_device_eval_batch_size=batch_size,
-        eval_strategy='epoch', # Set evaluation strategy
-        save_strategy='epoch', # Ensure save strategy matches evaluation strategy
+        eval_strategy='epoch',
+        save_strategy='epoch',
         learning_rate=learning_rate,
         save_steps=500,
         load_best_model_at_end=True,
@@ -301,21 +250,16 @@ def deploy_model(model, tokenizer):
 # Gradio Interface
 def gradio_interface(urls, file, text_input, model_name, batch_size, epochs):
     try:
-        # Create the dataset from the provided inputs
         dataset_file = create_dataset(urls, file, text_input)
 
-        # Load the dataset
         with open(dataset_file, 'r') as f:
             dataset = json.load(f)
 
-        # Check if the dataset is empty
         if not dataset:
             return "Error: The dataset is empty. Please check your inputs."
 
-        # Train the model
         model, tokenizer = train_model(model_name, dataset, batch_size, epochs)
 
-        # Deploy the model
         deploy_model(model, tokenizer)
 
         return dataset_file
@@ -338,7 +282,7 @@ iface = gr.Interface(
     outputs=gr.File(label="Download Combined Dataset"),
     title="Dataset Creation and Model Training",
     description="Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.",
-    theme="default", # You can change the theme if desired
+    theme="default",
 )
 
 # Launch the interface