acecalisto3 committed
Commit c1f083f · verified · 1 parent: 83b1e97

Update app.py

Files changed (1):
  app.py  +306 -358

app.py CHANGED

Previous version (lines removed by this commit are prefixed "-"; unchanged context lines carry no prefix):
@@ -1,11 +1,7 @@
import json
import os
import torch
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
- from sklearn.metrics import accuracy_score
- from torch.utils.data import DataLoader
- from transformers import Trainer, TrainingArguments
- import time
import requests
from bs4 import BeautifulSoup
import tempfile
@@ -14,401 +10,353 @@ import mimetypes
from tqdm import tqdm
import logging
import gradio as gr
- from typing import List, Dict
- # Setup logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

- # --- URL and File Processing Functions ---
- def fetch_content(url, retries=3):
-     for attempt in range(retries):
        try:
-             response = requests.get(url, timeout=10)
-             response.raise_for_status()
-             logger.info(f"Successfully fetched content from {url}")
-             return response.text
-         except requests.RequestException as e:
-             logger.error(f"Error fetching {url} (attempt {attempt + 1}/{retries}): {e}")
-             if attempt == retries - 1:
-                 return None
-
- def extract_text(html):
    if not html:
-         logger.warning("Empty HTML content provided for extraction.")
        return ""

    soup = BeautifulSoup(html, 'html.parser')
-     for script in soup(["script", "style"]):
-         script.decompose()

-     text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
-     extracted_text = '\n'.join(chunk for chunk in chunks if chunk)

-     logger.info("Text extraction completed.")
-     return extracted_text
-
- def process_urls(urls):
-     dataset = []
-     for url in tqdm(urls, desc="Fetching URLs"):
-         if not url.startswith("http://") and not url.startswith("https://"):
-             logger.warning(f"Invalid URL format: {url}")
-             continue
-         html = fetch_content(url)
-         if html:
-             text = extract_text(html)
-             if text:
-                 dataset.append({"source": "url", "url": url, "content": text})
-             else:
-                 logger.warning(f"No text extracted from {url}")
-         else:
-             logger.error(f"Failed to fetch content from {url}")
-         time.sleep(1)
-     return dataset

def preprocess_bulk_text(text: str) -> str:
-     """
-     Preprocess bulk text input by adding commas between logical separations.
-     Handles line breaks, slashes, and domain endings.
-     """
-     # First, normalize line endings
    text = text.replace('\r\n', '\n').replace('\r', '\n')

-     # Split by common separators
-     separators = [
-         '\n',    # Line breaks
-         ' / ',   # Forward slashes with spaces
-         '/',     # Forward slashes
-         ';',     # Semicolons
-         ' - ',   # Dashes with spaces
-         '|',     # Vertical bars
-         '  '     # Double spaces
-     ]

    # Replace separators with commas if not already comma-separated
    if ',' not in text:
        for separator in separators:
            text = text.replace(separator, ',')
-
-     # Handle domain endings (e.g., .com .org .net)
-     import re
    domain_pattern = r'(\.[a-z]{2,})\s+'
    text = re.sub(domain_pattern, r'\1,', text)

-     # Clean up multiple commas
    text = re.sub(r',+', ',', text)
-
-     # Remove leading/trailing commas and whitespace
    text = text.strip(',' + string.whitespace)
-
-     # Ensure proper spacing around commas
    text = re.sub(r'\s*,\s*', ', ', text)

    return text

- def process_inputs(urls, file, text_input, model_name, batch_size, epochs):
-     try:
-         # Log the input parameters for debugging
-         logger.info("Processing inputs with the following parameters:")
-         logger.info(f"URLs: {urls}")
-         logger.info(f"File: {file}")
-         logger.info(f"Text Input: {text_input}")
-         logger.info(f"Model Name: {model_name}")
-         logger.info(f"Batch Size: {batch_size}")
-         logger.info(f"Epochs: {epochs}")
-
-         # Validate inputs
-         if not urls and not file and not text_input:
-             logger.error("No input data provided. Please provide at least one of URLs, file, or text input.")
-             return "Error: No input data provided."
-
-         # Create dataset or perform any processing logic you need
-         output_file = create_dataset(urls, file, text_input, model_name, batch_size, epochs)
-
-         # Log the successful creation of the dataset
-         logger.info(f"Dataset created successfully: {output_file}")
-
-         return output_file  # Return the output file for download
-
-     except Exception as e:
-         logger.error(f"An error occurred while processing inputs: {e}")
-         return f"Error: {str(e)}"  # Return error message for user feedback
-
- # Assuming process_btn is a Gradio button
- process_btn.click(
-     fn=process_inputs,
-     inputs=[
-         urls_input,
-         file_input,
-         text_input,
-         model_name,
-         batch_size,
-         epochs
-     ],
-     outputs=download_output
- )
-
- def process_file(file):
-     dataset = []
-     with tempfile.TemporaryDirectory() as temp_dir:
-         if zipfile.is_zipfile(file.name):
-             with zipfile.ZipFile(file.name, 'r') as zip_ref:
-                 zip_ref.extractall(temp_dir)
-             for root, _, files in os.walk(temp_dir):
-                 for filename in files:
-                     filepath = os.path.join(root, filename)
-                     mime_type, _ = mimetypes.guess_type(filepath)
-                     if mime_type and mime_type.startswith('text'):
-                         with open(filepath, 'r', errors='ignore') as f:
-                             content = f.read()
-                             if content.strip():
-                                 dataset.append({"source": "file", "filename": filename, "content": content})
-                             else:
-                                 logger.warning(f"File {filename} is empty.")
-                     else:
-                         logger.warning(f"File {filename} is not a text file.")
-                         dataset.append({"source": "file", "filename": filename, "content": "Binary file - content not extracted"})
-         else:
-             mime_type, _ = mimetypes.guess_type(file.name)
-             if mime_type and mime_type.startswith('text'):
-                 content = file.read().decode('utf-8', errors='ignore')
-                 if content.strip():
-                     dataset.append({"source": "file", "filename": os.path.basename(file.name), "content": content})
-                 else:
-                     logger.warning(f"Uploaded file {file.name} is empty.")
-             else:
-                 logger.warning(f"Uploaded file {file.name} is not a text file.")
-                 dataset.append({"source": "file", "filename": os.path.basename(file.name), "content": "Binary file - content not extracted"})
-     return dataset
-
- def create_dataset(urls, file, text_input):
-     dataset = []
-     if urls:
-         dataset.extend(process_urls([url.strip() for url in urls.split(',') if url.strip()]))
-     if file:
-         dataset.extend(process_file(file))
-     if text_input:
-         dataset.append({"source": "input", "content": text_input})
-
-     logger.info(f"Dataset created with {len(dataset)} entries.")
-     output_file = 'combined_dataset.json'
-     with open(output_file, 'w') as f:
-         json.dump(dataset, f, indent=2)
-
-     return output_file
-
- # --- Model Training and Evaluation Functions ---
- class CustomDataset(torch.utils.data.Dataset):
-     def __init__(self, data, tokenizer, max_length=512):
-         self.data = data
-         self.tokenizer = tokenizer
-         self.max_length = max_length
-
-     def __len__(self):
-         return len(self.data)
-
-     def __getitem__(self, idx):
-         try:
-             text = self.data[idx]['content']  # Fixed the key to 'content'
-             label = self.data[idx].get('label', 0)
-
-             encoding = self.tokenizer.encode_plus(
-                 text,
-                 max_length=self.max_length,
-                 padding='max_length',
-                 truncation=True,
-                 return_attention_mask=True,
-                 return_tensors='pt',
-             )
-
-             return {
-                 'input_ids': encoding['input_ids'].squeeze(),
-                 'attention_mask': encoding['attention_mask'].squeeze(),
-                 'labels': torch.tensor(label, dtype=torch.long)
-             }
-         except Exception as e:
-             logger.error(f"Error in processing item {idx}: {e}")
-             raise
-
- def train_model(model_name, data, batch_size, epochs, learning_rate=1e-5, max_length=2048):
-     try:
-         model = AutoModelForSequenceClassification.from_pretrained(model_name)
-         tokenizer = AutoTokenizer.from_pretrained(model_name)
-         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-         model.to(device)

-         dataset = CustomDataset(data, tokenizer, max_length=max_length)

-         if len(dataset) == 0:
-             logger.error("The dataset is empty. Please check the input data.")
-             return None, None
-
-         train_size = int(0.8 * len(dataset))
-         val_size = len(dataset) - train_size
-         train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
-
-         training_args = TrainingArguments(
-             output_dir='./results',
-             num_train_epochs=epochs,
-             per_device_train_batch_size=batch_size,
-             per_device_eval_batch_size=batch_size,
-             eval_strategy='epoch',
-             save_strategy='epoch',
-             learning_rate=learning_rate,
-             save_steps=500,
-             load_best_model_at_end=True,
-             metric_for_best_model='accuracy',
-             greater_is_better=True,
-             save_total_limit=2,
-             seed=42,
-             dataloader_num_workers=4,
-             fp16=torch.cuda.is_available()
-         )
-
-         trainer = Trainer(
-             model=model,
-             args=training_args,
-             train_dataset=train_dataset,
-             eval_dataset=val_dataset,
-             compute_metrics=lambda pred: {
-                 'accuracy': accuracy_score(pred.label_ids, pred.predictions.argmax(-1))
-             }
-         )
-
-         logger.info("Starting model training...")
-         start_time = time.time()
-         trainer.train()
-         end_time = time.time()
-         logger.info(f'Training time: {end_time - start_time:.2f} seconds')
-
-         logger.info("Evaluating model...")
-         eval_result = trainer.evaluate()
-         logger.info(f'Evaluation result: {eval_result}')
-
-         trainer.save_model('./model')
-
-         return model, tokenizer
-
-     except Exception as e:
-         logger.error(f"Error during training: {e}")
-         raise
-
- def deploy_model(model, tokenizer):
-     try:
-         model.save_pretrained('./model')
-         tokenizer.save_pretrained('./model')
-
-         deployment_script = f'''
- import torch
- from transformers import AutoModelForSequenceClassification, AutoTokenizer
- model = AutoModelForSequenceClassification.from_pretrained('./model')
- tokenizer = AutoTokenizer.from_pretrained('./model')
- def predict(text):
-     encoding = tokenizer.encode_plus(
-         text,
-         max_length=512,
-         padding='max_length',
-         truncation=True,
-         return_attention_mask=True,
-         return_tensors='pt',
    )
-     input_ids = encoding['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
-     attention_mask = encoding['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')
-     outputs = model(input_ids, attention_mask=attention_mask)
-     logits = outputs.logits
-     return torch.argmax(logits, dim=1).cpu().numpy()[0]
- '''

-         with open('./deployment.py', 'w') as f:
-             f.write(deployment_script)

-         logger.info('Model deployed successfully. To use the model, run: python deployment.py')

-     except Exception as e:
-         logger.error(f"Error deploying model: {e}")
-         raise
-
- def create_interface():
-     """Create and return the Gradio interface"""
-     with gr.Blocks(title="Dataset Creation and Model Training") as interface:
-         gr.Markdown("# Dataset Creation and Model Training")
-         gr.Markdown("Enter URLs, upload files (including zip files), and/or paste text to create a dataset and train a model.")

        with gr.Row():
-             with gr.Column():
-                 # URL input with auto-separation
-                 urls_input = gr.Textbox(
-                     lines=5,
-                     label="Enter URLs",
-                     placeholder="Enter URLs separated by line breaks, commas, or slashes"
-                 )
-
-                 # File upload
-                 file_input = gr.File(
-                     label="Upload file (including zip files)",
-                     type="filepath"
-                 )
-
-                 # Large text input
-                 text_input = gr.Textbox(
-                     lines=10,
-                     label="Enter or paste large text",
-                     placeholder="Your text here..."
-                 )
-
-             with gr.Column():
-                 # Model configuration
-                 model_name = gr.Textbox(
-                     label="Model name",
-                     value="distilbert-base-uncased"
-                 )
-                 batch_size = gr.Number(
-                     label="Batch size",
-                     value=8,
-                     precision=0,
-                     step=1
-                 )
-                 epochs = gr.Number(
-                     label="Epochs",
-                     value=3,
-                     precision=0,
-                     step=1
-                 )
-
-         # Process button and output
-         with gr.Row():
-             process_btn = gr.Button("Process and Train")
-             download_output = gr.File(label="Download Combined Dataset")

-         # Event handlers
        process_btn.click(
-             fn=create_interface,
-             inputs=[
-                 urls_input,
-                 file_input,
-                 text_input,
-                 model_name,
-                 batch_size,
-                 epochs
-             ],
-             outputs=download_output
        )

-         # Preview processed URLs
-         with gr.Row():
-             preview_btn = gr.Button("Preview Processed URLs")
-             preview_output = gr.JSON(label="Processed Items")
-
-             preview_btn.click(
-                 fn=process_input,
-                 inputs=[urls_input],
-                 outputs=[preview_output]
-             )

    return interface

- # Launch the interface
if __name__ == "__main__":
-     demo = create_interface()
-     demo.launch()
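The listing above is the version this commit replaces. Its entire training half (CustomDataset, train_model, deploy_model) is dropped, so the Space no longer fine-tunes anything after this change. For reference, the removed flow reduces to roughly the following; this is a condensed sketch with toy texts, toy labels, and a hypothetical ToyDataset class standing in for the app's scraped dataset, not the author's exact code:

# Minimal sketch of the fine-tuning path removed by this commit (toy data, condensed arguments).
import torch
from sklearn.metrics import accuracy_score
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

texts = ["good service", "terrible service"]   # stand-ins for each entry's "content"
labels = [1, 0]                                # stand-ins for the optional "label" field

class ToyDataset(torch.utils.data.Dataset):    # plays the role of CustomDataset
    def __len__(self):
        return len(texts)
    def __getitem__(self, idx):
        enc = tokenizer(texts[idx], max_length=64, padding="max_length",
                        truncation=True, return_tensors="pt")
        return {"input_ids": enc["input_ids"].squeeze(0),
                "attention_mask": enc["attention_mask"].squeeze(0),
                "labels": torch.tensor(labels[idx])}

args = TrainingArguments(output_dir="./results", num_train_epochs=1,
                         per_device_train_batch_size=2)
trainer = Trainer(model=model, args=args, train_dataset=ToyDataset(),
                  compute_metrics=lambda p: {"accuracy": accuracy_score(
                      p.label_ids, p.predictions.argmax(-1))})
trainer.train()
trainer.save_model("./model")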
Updated version (lines added by this commit are prefixed "+"; unchanged context lines carry no prefix):

import json
import os
import torch
+ import string
import requests
from bs4 import BeautifulSoup
import tempfile

from tqdm import tqdm
import logging
import gradio as gr
+ from typing import List, Dict, Union, Optional
+ from urllib.parse import urlparse
+ import concurrent.futures
+ import validators
+ from pathlib import Path
+ import re
+ import time  # needed by fetch_content() retries and the "timestamp" fields below; the old top-level import was dropped in this commit
+
+ # Setup logging with more detailed configuration
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
+     handlers=[
+         logging.StreamHandler(),
+         logging.FileHandler('app.log')
+     ]
+ )
logger = logging.getLogger(__name__)

+ class URLProcessor:
+     """Class to handle URL processing with advanced features"""
+
+     def __init__(self, timeout: int = 10, max_retries: int = 3, concurrent_requests: int = 5):
+         self.timeout = timeout
+         self.max_retries = max_retries
+         self.concurrent_requests = concurrent_requests
+         self.session = requests.Session()
+         # Add common headers to mimic browser behavior
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+         })
+
+     def validate_url(self, url: str) -> bool:
+         """Validate URL format and accessibility"""
        try:
+             result = urlparse(url)
+             return all([result.scheme, result.netloc]) and validators.url(url)
+         except Exception as e:
+             logger.warning(f"Invalid URL format: {url} - {str(e)}")
+             return False
+
+     def fetch_content(self, url: str) -> Optional[str]:
+         """Fetch content from URL with retry mechanism"""
+         for attempt in range(self.max_retries):
+             try:
+                 response = self.session.get(url, timeout=self.timeout)
+                 response.raise_for_status()
+                 return response.text
+             except requests.RequestException as e:
+                 logger.error(f"Attempt {attempt + 1}/{self.max_retries} failed for {url}: {str(e)}")
+                 if attempt == self.max_retries - 1:
+                     return None
+                 time.sleep(1)  # Delay between retries
+
+     def process_urls(self, urls: List[str]) -> List[Dict]:
+         """Process multiple URLs concurrently"""
+         valid_urls = [url for url in urls if self.validate_url(url)]
+         if not valid_urls:
+             logger.warning("No valid URLs to process")
+             return []
+
+         results = []
+         with concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrent_requests) as executor:
+             future_to_url = {executor.submit(self.fetch_content, url): url for url in valid_urls}
+             for future in concurrent.futures.as_completed(future_to_url):
+                 url = future_to_url[future]
+                 try:
+                     html = future.result()
+                     if html:
+                         text = extract_text(html)
+                         if text:
+                             results.append({
+                                 "source": "url",
+                                 "url": url,
+                                 "content": text,
+                                 "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                             })
+                         else:
+                             logger.warning(f"No text content extracted from {url}")
+                 except Exception as e:
+                     logger.error(f"Error processing {url}: {str(e)}")
+
+         return results
+
+ def extract_text(html: str) -> str:
+     """Enhanced text extraction with better cleaning"""
    if not html:
        return ""

    soup = BeautifulSoup(html, 'html.parser')

+     # Remove unwanted elements
+     for element in soup(['script', 'style', 'header', 'footer', 'nav']):
+         element.decompose()
+
+     # Extract text with better formatting
+     text = soup.get_text(separator=' ')
+
+     # Clean up the text
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+     text = ' '.join(chunk for chunk in chunks if chunk)

+     # Remove excessive whitespace
+     text = re.sub(r'\s+', ' ', text)
+
+     return text.strip()
+
+ class FileProcessor:
+     """Class to handle file processing"""
+
+     def __init__(self, max_file_size: int = 10 * 1024 * 1024):  # 10MB default
+         self.max_file_size = max_file_size
+         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
+
+     def is_text_file(self, filepath: str) -> bool:
+         """Check if file is a text file"""
+         try:
+             mime_type, _ = mimetypes.guess_type(filepath)
+             return mime_type and mime_type.startswith('text/')
+         except Exception:
+             return False
+
+     def process_file(self, file) -> List[Dict]:
+         """Process uploaded file with enhanced error handling"""
+         if not file:
+             return []
+
+         dataset = []
+         try:
+             file_size = os.path.getsize(file.name)
+             if file_size > self.max_file_size:
+                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+                 return []
+
+             with tempfile.TemporaryDirectory() as temp_dir:
+                 if zipfile.is_zipfile(file.name):
+                     dataset.extend(self._process_zip_file(file.name, temp_dir))
+                 else:
+                     dataset.extend(self._process_single_file(file))
+
+         except Exception as e:
+             logger.error(f"Error processing file: {str(e)}")
+             return []
+
+         return dataset
+
+     def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
+         """Process ZIP file contents"""
+         results = []
+         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+             zip_ref.extractall(temp_dir)
+             for root, _, files in os.walk(temp_dir):
+                 for filename in files:
+                     filepath = os.path.join(root, filename)
+                     if self.is_text_file(filepath):
+                         try:
+                             with open(filepath, 'r', errors='ignore') as f:
+                                 content = f.read()
+                                 if content.strip():
+                                     results.append({
+                                         "source": "file",
+                                         "filename": filename,
+                                         "content": content,
+                                         "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                                     })
+                         except Exception as e:
+                             logger.error(f"Error reading file {filename}: {str(e)}")
+         return results
+
+     def _process_single_file(self, file) -> List[Dict]:
+         """Process single file"""
+         results = []
+         try:
+             content = file.read().decode('utf-8', errors='ignore')
+             if content.strip():
+                 results.append({
+                     "source": "file",
+                     "filename": os.path.basename(file.name),
+                     "content": content,
+                     "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                 })
+         except Exception as e:
+             logger.error(f"Error processing single file: {str(e)}")
+         return results

def preprocess_bulk_text(text: str) -> str:
+     """Enhanced text preprocessing"""
+     if not text:
+         return ""
+
+     # Normalize line endings
    text = text.replace('\r\n', '\n').replace('\r', '\n')

+     # Define separators
+     separators = ['\n', ' / ', '/', ';', ' - ', '|', '  ']

    # Replace separators with commas if not already comma-separated
    if ',' not in text:
        for separator in separators:
            text = text.replace(separator, ',')
+
+     # Handle domain endings
    domain_pattern = r'(\.[a-z]{2,})\s+'
    text = re.sub(domain_pattern, r'\1,', text)

+     # Clean up multiple commas and whitespace
    text = re.sub(r',+', ',', text)
    text = text.strip(',' + string.whitespace)
    text = re.sub(r'\s*,\s*', ', ', text)

    return text
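Aside (not part of app.py): the behaviour of preprocess_bulk_text is easiest to see on a concrete input. A minimal check, assuming the file is importable as `app` (its launch() call is guarded by `__main__`):

# Quick check of preprocess_bulk_text; the sample string is illustrative only.
from app import preprocess_bulk_text

print(preprocess_bulk_text("alpha\nbeta;gamma | delta"))
# -> "alpha, beta, gamma, delta": separators collapse to ", " when the input has no comma.

Note that '/' is itself one of the separators, so slash-containing strings such as raw URLs pasted into the text tab get split as well; URLs entered in the URL tab bypass this function entirely.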

+ def create_interface():
+     """Create enhanced Gradio interface"""
+
+     # Custom CSS for better styling
+     custom_css = """
+     .container { max-width: 1200px; margin: auto; padding: 20px; }
+     .output-panel { margin-top: 20px; }
+     .warning { color: #856404; background-color: #fff3cd; padding: 10px; border-radius: 4px; }
+     .error { color: #721c24; background-color: #f8d7da; padding: 10px; border-radius: 4px; }
+     """

+     with gr.Blocks(css=custom_css) as interface:
+         gr.Markdown("# Advanced URL and Text Processing Tool")

+         with gr.Tab("URL Input"):
+             url_input = gr.Textbox(
+                 label="Enter URLs (comma-separated or one per line)",
+                 placeholder="https://example1.com, https://example2.com",
+                 lines=5
            )

+         with gr.Tab("File Input"):
+             file_input = gr.File(
+                 label="Upload text file or ZIP archive",
+                 file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
+             )

+         with gr.Tab("Text Input"):
+             text_input = gr.Textbox(
+                 label="Enter text directly",
+                 placeholder="Enter your text here...",
+                 lines=5
+             )

+         # Process button with loading state
+         process_btn = gr.Button("Process Input", variant="primary")

+         # Output components
        with gr.Row():
+             output_file = gr.File(label="Processed Dataset")
+             output_text = gr.Textbox(
+                 label="Processing Results",
+                 lines=3,
+                 interactive=False
+             )

+         def process_all_inputs(urls, file, text):
+             """Process all input types with progress tracking"""
+             try:
+                 dataset = []
+
+                 # Process URLs
+                 if urls:
+                     url_processor = URLProcessor()
+                     url_list = [u.strip() for u in urls.split(',') if u.strip()]
+                     dataset.extend(url_processor.process_urls(url_list))
+
+                 # Process files
+                 if file:
+                     file_processor = FileProcessor()
+                     dataset.extend(file_processor.process_file(file))
+
+                 # Process text input
+                 if text:
+                     processed_text = preprocess_bulk_text(text)
+                     if processed_text:
+                         dataset.append({
+                             "source": "input",
+                             "content": processed_text,
+                             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
+                         })
+
+                 if not dataset:
+                     return [None, "No valid data to process. Please check your inputs."]
+
+                 # Save results
+                 output_file = 'processed_dataset.json'
+                 with open(output_file, 'w', encoding='utf-8') as f:
+                     json.dump(dataset, f, indent=2, ensure_ascii=False)
+
+                 # Generate summary
+                 summary = f"""
+                 Processing completed successfully!
+                 - URLs processed: {sum(1 for d in dataset if d['source'] == 'url')}
+                 - Files processed: {sum(1 for d in dataset if d['source'] == 'file')}
+                 - Text inputs processed: {sum(1 for d in dataset if d['source'] == 'input')}
+                 """
+
+                 return [output_file, summary]
+
+             except Exception as e:
+                 error_msg = f"Error during processing: {str(e)}"
+                 logger.error(error_msg)
+                 return [None, error_msg]
+
+         # Connect the interface
        process_btn.click(
+             fn=process_all_inputs,
+             inputs=[url_input, file_input, text_input],
+             outputs=[output_file, output_text]
        )

+         # Add comprehensive instructions
+         gr.Markdown("""
+         ## Instructions
+         1. **URL Input**:
+            - Enter URLs separated by commas or new lines
+            - URLs must start with http:// or https://
+            - Invalid URLs will be skipped
+
+         2. **File Input**:
+            - Upload text files or ZIP archives
+            - Supported formats: .txt, .zip, .md, .csv, .json, .xml
+            - Maximum file size: 10MB
+
+         3. **Text Input**:
+            - Directly enter or paste text
+            - Text will be automatically formatted
+
+         4. Click 'Process Input' to generate the dataset
+
+         The tool will combine all valid inputs into a single JSON dataset file.
+         """)

    return interface

if __name__ == "__main__":
+     # Initialize mimetypes
+     mimetypes.init()
+
+     # Create and launch the interface
+     interface = create_interface()
+     interface.launch(
+         share=True,
+         server_name="0.0.0.0",
+         server_port=7860,
+         debug=True
+     )
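Because everything in the new version is wired through Gradio callbacks, it can help to see the same pieces driven headlessly. A minimal sketch, assuming the file imports as `app`, that its dependencies (requests, beautifulsoup4, validators, gradio) are installed, and that the `import time` noted in the import block above is present; the URLs here are placeholders, not endpoints from the commit:

# Headless use of the refactored pieces, outside the Gradio UI.
import json
from app import URLProcessor, preprocess_bulk_text

processor = URLProcessor(timeout=10, max_retries=2, concurrent_requests=2)
records = processor.process_urls([
    "https://example.com",   # fetched concurrently; invalid entries are filtered by validate_url()
    "not-a-url",             # dropped during validation
])

records.append({
    "source": "input",
    "content": preprocess_bulk_text("alpha\nbeta;gamma"),   # -> "alpha, beta, gamma"
})

with open("processed_dataset.json", "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)

print(f"Wrote {len(records)} records")

This mirrors what process_all_inputs does for the URL and text paths, minus the timestamping and the Gradio file handle used by FileProcessor.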