acecalisto3 committed on
Commit 200e562 · verified · 1 Parent(s): 2713a9a

Update app.py

Files changed (1)
  1. app.py +355 -303
app.py CHANGED
@@ -1,364 +1,416 @@
  import json
  import os
- import torch
- import string
- import requests
- from bs4 import BeautifulSoup
- import tempfile
- import zipfile
- import mimetypes
- from tqdm import tqdm
  import logging
- import gradio as gr
- from typing import List, Dict, Union, Optional
- from urllib.parse import urlparse
  import concurrent.futures
- import validators
  from pathlib import Path
- import re

- # Setup logging with more detailed configuration
  logging.basicConfig(
      level=logging.INFO,
      format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
      handlers=[
          logging.StreamHandler(),
-         logging.FileHandler('app.log')
      ]
  )
  logger = logging.getLogger(__name__)

- class URLProcessor:
-     """Class to handle URL processing with advanced features"""

-     def __init__(self, timeout: int = 10, max_retries: int = 3, concurrent_requests: int = 5):
          self.timeout = timeout
          self.max_retries = max_retries
          self.concurrent_requests = concurrent_requests
          self.session = requests.Session()
-         # Add common headers to mimic browser behavior
          self.session.headers.update({
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
          })

-     def validate_url(self, url: str) -> bool:
-         """Validate URL format and accessibility"""
-         try:
-             result = urlparse(url)
-             is_valid = all([result.scheme, result.netloc]) and validators.url(url)
-             logger.info(f"Validating URL: {url} - Result: {is_valid}")
-             return is_valid
-         except Exception as e:
-             logger.warning(f"Invalid URL format: {url} - {str(e)}")
-             return False
-
-     def fetch_content(self, url: str) -> Optional[str]:
-         """Fetch content from URL with retry mechanism"""
-         for attempt in range(self.max_retries):
              try:
-                 response = self.session.get(url, timeout=self.timeout)
-                 response.raise_for_status()
-                 return response.text
              except requests.RequestException as e:
-                 logger.error(f"Attempt {attempt + 1}/{self.max_retries} failed for {url}: {str(e)}")
-                 if attempt == self.max_retries - 1:
-                     return None
-                 time.sleep(1) # Delay between retries
-
-     def process_urls(self, urls: List[str]) -> List[Dict]:
-         """Process multiple URLs concurrently"""
-         valid_urls = [url for url in urls if self.validate_url(url)]
-         if not valid_urls:
-             logger.warning("No valid URLs to process")
-             return []

-         results = []
-         with concurrent.futures.ThreadPoolExecutor(max_workers=self.concurrent_requests) as executor:
-             future_to_url = {executor.submit(self.fetch_content, url): url for url in valid_urls}
-             for future in concurrent.futures.as_completed(future_to_url):
-                 url = future_to_url[future]
                  try:
-                     html = future.result()
-                     if html:
-                         text = extract_text(html)
-                         if text:
-                             results.append({
-                                 "source": "url",
-                                 "url": url,
-                                 "content": text,
-                                 "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-                             })
-                     else:
-                         logger.warning(f"No text content extracted from {url}")
-                 except Exception as e:
-                     logger.error(f"Error processing {url}: {str(e)}")
-
-         return results
-
- def extract_text(html: str) -> str:
-     """Enhanced text extraction with better cleaning"""
-     if not html:
-         return ""

-     soup = BeautifulSoup(html, 'html.parser')

-     # Remove unwanted elements
-     for element in soup(['script', 'style', 'header', 'footer', 'nav']):
-         element.decompose()
-
-     # Extract text with better formatting
-     text = soup.get_text(separator=' ')

-     # Clean up the text
-     lines = (line.strip() for line in text.splitlines())
-     chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-     text = ' '.join(chunk for chunk in chunks if chunk)

-     # Remove excessive whitespace
-     text = re.sub(r'\s+', ' ', text)

-     return text.strip()
-
- class FileProcessor:
-     """Class to handle file processing"""

-     def __init__(self, max_file_size: int = 10 * 1024 * 1024): # 10MB default
-         self.max_file_size = max_file_size
-         self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
-
-     def is_text_file(self, filepath: str) -> bool:
-         """Check if file is a text file"""
-         try:
-             mime_type, _ = mimetypes.guess_type(filepath)
-             return mime_type and mime_type.startswith('text/')
-         except Exception:
-             return False
-
-     def process_file(self, file) -> List[Dict]:
-         """Process uploaded file with enhanced error handling"""
-         if not file:
-             return []

-         dataset = []
          try:
-             file_size = os.path.getsize(file.name)
-             if file_size > self.max_file_size:
-                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
-                 return []
-
-             with tempfile.TemporaryDirectory() as temp_dir:
-                 if zipfile.is_zipfile(file.name):
-                     dataset.extend(self._process_zip_file(file.name, temp_dir))
                  else:
-                     dataset.extend(self._process_single_file(file))
-
          except Exception as e:
-             logger.error(f"Error processing file: {str(e)}")
-             return []

-         return dataset
-
-     def _process_zip_file(self, zip_path: str, temp_dir: str) -> List[Dict]:
-         """Process ZIP file contents"""
-         results = []
-         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
-             zip_ref.extractall(temp_dir)
-             for root, _, files in os.walk(temp_dir):
-                 for filename in files:
-                     filepath = os.path.join(root, filename)
-                     if self.is_text_file(filepath):
-                         try:
-                             with open(filepath, 'r', errors='ignore') as f:
-                                 content = f.read()
-                                 if content.strip():
-                                     results.append({
-                                         "source": "file",
-                                         "filename": filename,
-                                         "content": content,
-                                         "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-                                     })
-                         except Exception as e:
-                             logger.error(f"Error reading file {filename}: {str(e)}")
-         return results
-
-     def _process_single_file(self, file) -> List[Dict]:
-         """Process single file"""
-         results = []
          try:
-             content = file.read().decode('utf-8', errors='ignore')
-             if content.strip():
-                 results.append({
-                     "source": "file",
-                     "filename": os.path.basename(file.name),
-                     "content": content,
-                     "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-                 })
          except Exception as e:
-             logger.error(f"Error processing single file: {str(e)}")
-         return results
-
- def preprocess_bulk_text(text: str) -> str:
-     """Enhanced text preprocessing"""
-     if not text:
-         return ""
-
-     # Normalize line endings
-     text = text.replace('\r\n', '\n').replace('\r', '\n')
-
-     # Define separators
-     separators = ['\n', ' / ', '/', ';', ' - ', '|', ' ']
-
-     # Replace separators with commas if not already comma-separated
-     if ',' not in text:
-         for separator in separators:
-             text = text.replace(separator, ',')
-
-     # Handle domain endings
-     domain_pattern = r'(\.[a-z]{2,})\s+'
-     text = re.sub(domain_pattern, r'\1,', text)
-
-     # Clean up multiple commas and whitespace
-     text = re.sub(r',+', ',', text)
-     text = text.strip(',' + string.whitespace)
-     text = re.sub(r'\s*,\s*', ', ', text)
-
-     return text

  def create_interface():
-     """Create enhanced Gradio interface"""

-     # Custom CSS for better styling
-     custom_css = """
-     .container { max-width: 1200px; margin: auto; padding: 20px; }
-     .output-panel { margin-top: 20px; }
-     .warning { color: #856404; background-color: #fff3cd; padding: 10px; border-radius: 4px; }
-     .error { color: #721c24; background-color: #f8d7da; padding: 10px; border-radius: 4px; }
-     """
-
-     with gr.Blocks(css=custom_css) as interface:
-         gr.Markdown("# Advanced URL and Text Processing Tool")

-         with gr.Tab("URL Input"):
-             url_input = gr.Textbox(
-                 label="Enter URLs (comma-separated or one per line)",
-                 placeholder="https://example1.com, https://example2.com",
-                 lines=5
-             )
-
-         with gr.Tab("File Input"):
-             file_input = gr.File(
-                 label="Upload text file or ZIP archive",
-                 file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
-             )
-
-         with gr.Tab("Text Input"):
-             text_input = gr.Textbox(
-                 label="Enter text directly",
-                 placeholder="Enter your text here...",
-                 lines=5
-             )
-
-         # Process button with loading state
-         process_btn = gr.Button("Process Input", variant="primary")
-
-         # Output components
          with gr.Row():
-             output_file = gr.File(label="Processed Dataset")
-             output_text = gr.Textbox(
-                 label="Processing Results",
-                 lines=3,
-                 interactive=False
-             )
-
-         def process_all_inputs(urls, file, text):
-             """Process all input types with progress tracking"""
-             try:
-                 dataset = []

-                 # Process URLs
-                 if urls:
-                     url_processor = URLProcessor()
-                     url_list = [u.strip() for u in urls.split(',') if u.strip()]
-                     dataset.extend(url_processor.process_urls(url_list))
-
-                 # Process files
-                 if file:
-                     file_processor = FileProcessor()
-                     dataset.extend(file_processor.process_file(file))
-
-                 # Process text input
-                 if text:
-                     processed_text = preprocess_bulk_text(text)
-                     if processed_text:
-                         dataset.append({
-                             "source": "input",
-                             "content": processed_text,
-                             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
-                         })
-
-                 if not dataset:
-                     return [None, "No valid data to process. Please check your inputs."]
-
-                 # Save results
-                 output_file = 'processed_dataset.json'
-                 with open(output_file, 'w', encoding='utf-8') as f:
-                     json.dump(dataset, f, indent=2, ensure_ascii=False)
-
-                 # Generate summary
-                 summary = f"""
-                 Processing completed successfully!
-                 - URLs processed: {sum(1 for d in dataset if d['source'] == 'url')}
-                 - Files processed: {sum(1 for d in dataset if d['source'] == 'file')}
-                 - Text inputs processed: {sum(1 for d in dataset if d['source'] == 'input')}
-                 """
-
-                 return [output_file, summary]
-
-             except Exception as e:
-                 error_msg = f"Error during processing: {str(e)}"
-                 logger.error(error_msg)
-                 return [None, error_msg]
-
-         # Connect the interface
-         process_btn.click(
-             fn=process_all_inputs,
-             inputs=[url_input, file_input, text_input],
-             outputs=[output_file, output_text]
-         )
-
-         # Add comprehensive instructions
-         gr.Markdown("""
-         ## Instructions
-         1. **URL Input**:
-            - Enter URLs separated by commas or new lines
-            - URLs must start with http:// or https://
-            - Invalid URLs will be skipped

-         2. **File Input**:
-            - Upload text files or ZIP archives
-            - Supported formats: .txt, .zip, .md, .csv, .json, .xml
-            - Maximum file size: 10MB

-         3. **Text Input**:
-            - Directly enter or paste text
-            - Text will be automatically formatted

-         4. Click 'Process Input' to generate the dataset

-         The tool will combine all valid inputs into a single JSON dataset file.
-         """)
-
      return interface

  if __name__ == "__main__":
      # Initialize mimetypes
      mimetypes.init()

-     # Create and launch the interface
      interface = create_interface()
      interface.launch(
-         share=True,
          server_name="0.0.0.0",
          server_port=7860,
          debug=True
-     )

  import json
  import os
+ import re
+ import time
  import logging
+ import mimetypes
  import concurrent.futures
+ import string
+ from typing import List, Dict, Optional, Union, Any
  from pathlib import Path
+ from urllib.parse import urlparse

+ import requests
+ import validators
+ import gradio as gr
+ import torch
+ import cachetools
+ from bs4 import BeautifulSoup
+ from fake_useragent import UserAgent
+ from ratelimit import limits, sleep_and_retry
+
+ # Advanced Logging Configuration
  logging.basicConfig(
      level=logging.INFO,
      format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
      handlers=[
          logging.StreamHandler(),
+         logging.FileHandler('app_advanced.log', encoding='utf-8')
      ]
  )
  logger = logging.getLogger(__name__)

+ class AdvancedURLProcessor:
+     """Enhanced URL processing with advanced features"""

+     def __init__(
+         self,
+         timeout: int = 15,
+         max_retries: int = 3,
+         concurrent_requests: int = 5,
+         cache_size: int = 100
+     ):
          self.timeout = timeout
          self.max_retries = max_retries
          self.concurrent_requests = concurrent_requests
+         self.ua = UserAgent()
+
+         # Implement multilevel caching
+         self.url_cache = cachetools.LRUCache(maxsize=cache_size)
+         self.content_cache = cachetools.TTLCache(maxsize=cache_size, ttl=3600) # 1-hour cache
+
          self.session = requests.Session()
          self.session.headers.update({
+             'User-Agent': self.ua.random,
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Connection': 'keep-alive'
          })

+     @sleep_and_retry
+     @limits(calls=10, period=60) # Rate limiting: 10 calls per minute
+     def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
+         """Enhanced URL validation with comprehensive checks"""
+         try:
+             # Check cache first
+             if url in self.url_cache:
+                 return self.url_cache[url]
+
+             # Comprehensive URL validation
+             result = urlparse(url)
+             validation_result = {
+                 'is_valid': False,
+                 'message': 'Invalid URL',
+                 'scheme': result.scheme,
+                 'netloc': result.netloc
+             }
+
+             if not all([result.scheme, result.netloc]):
+                 validation_result['message'] = 'Missing scheme or network location'
+                 return validation_result
+
+             # Use validators for additional checks
+             if not validators.url(url):
+                 validation_result['message'] = 'URL format validation failed'
+                 return validation_result
+
+             # Perform HEAD request for accessibility
              try:
+                 response = self.session.head(
+                     url,
+                     timeout=self.timeout,
+                     allow_redirects=True
+                 )
+
+                 validation_result['is_valid'] = response.status_code in [200, 301, 302]
+                 validation_result['status_code'] = response.status_code
+                 validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
+
              except requests.RequestException as e:
+                 validation_result['message'] = f"Connection error: {str(e)}"
+
+             # Cache the result
+             self.url_cache[url] = validation_result
+             return validation_result
+
+         except Exception as e:
+             logger.error(f"Unexpected error validating URL {url}: {e}")
+             return {
+                 'is_valid': False,
+                 'message': f"Unexpected validation error: {str(e)}"
+             }
+
+     @sleep_and_retry
+     @limits(calls=10, period=60)
+     async def fetch_content(self, url: str) -> Optional[str]:
+         """Fetch content from URL with retry mechanism and caching"""
+         try:
+             # Check content cache first
+             if url in self.content_cache:
+                 logger.info(f"Cache hit for URL: {url}")
+                 return self.content_cache[url]

+             for attempt in range(self.max_retries):
                  try:
+                     response = self.session.get(url, timeout=self.timeout)
+                     response.raise_for_status()
+                     content = response.text
+
+                     # Cache the content
+                     self.content_cache[url] = content
+                     return content
+
+                 except requests.RequestException as e:
+                     logger.warning(f"Attempt {attempt + 1}/{self.max_retries} failed for {url}: {str(e)}")
+                     if attempt == self.max_retries - 1:
+                         raise
+                     time.sleep(1) # Delay between retries
+
+         except Exception as e:
+             logger.error(f"Error fetching content from {url}: {e}")
+             return None

+ class ContentExtractor:
+     """Advanced content extraction and processing"""

+     def __init__(self):
+         self.cleaners = [
+             self._remove_scripts,
+             self._remove_styles,
+             self._remove_special_chars,
+             self._normalize_whitespace
+         ]
+
+     def extract_text(self, html: str, url: str = "") -> Dict[str, Union[str, Dict]]:
+         """Extract and clean text content with metadata"""
+         try:
+             if not html:
+                 return {
+                     "success": False,
+                     "content": "",
+                     "metadata": {"error": "Empty HTML content"}
+                 }
+
+             soup = BeautifulSoup(html, 'html.parser')
+
+             # Extract metadata
+             metadata = self._extract_metadata(soup, url)
+
+             # Clean content
+             content = self._process_content(soup)
+
+             return {
+                 "success": True,
+                 "content": content,
+                 "metadata": metadata
+             }
+
+         except Exception as e:
+             logger.error(f"Content extraction error for {url}: {e}")
+             return {
+                 "success": False,
+                 "content": "",
+                 "metadata": {"error": str(e)}
+             }

+     def _extract_metadata(self, soup: BeautifulSoup, url: str) -> Dict:
+         """Extract page metadata"""
+         metadata = {
+             "title": self._get_title(soup),
+             "description": self._get_meta_description(soup),
+             "keywords": self._get_meta_keywords(soup),
+             "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+             "url": url
+         }
+         return metadata

+     def _process_content(self, soup: BeautifulSoup) -> str:
+         """Process and clean content through multiple passes"""
+         for cleaner in self.cleaners:
+             soup = cleaner(soup)
+
+         # Extract text with preserved structure
+         lines = []
+         for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
+             text = element.get_text(strip=True)
+             if text:
+                 lines.append(text)
+
+         return "\n".join(lines)

+     @staticmethod
+     def _remove_scripts(soup: BeautifulSoup) -> BeautifulSoup:
+         for script in soup(["script", "style", "iframe", "noscript"]):
+             script.decompose()
+         return soup

+     @staticmethod
+     def _remove_styles(soup: BeautifulSoup) -> BeautifulSoup:
+         for element in soup.find_all(style=True):
+             del element['style']
+         return soup
+
+     @staticmethod
+     def _remove_special_chars(soup: BeautifulSoup) -> BeautifulSoup:
+         text = soup.get_text()
+         text = re.sub(r'[^\w\s\.\,\!\?\-]', '', text)
+         new_soup = BeautifulSoup(f"<div>{text}</div>", 'html.parser')
+         return new_soup
+
+     @staticmethod
+     def _normalize_whitespace(soup: BeautifulSoup) -> BeautifulSoup:
+         text = soup.get_text()
+         text = re.sub(r'\s+', ' ', text)
+         new_soup = BeautifulSoup(f"<div>{text}</div>", 'html.parser')
+         return new_soup
+
+     @staticmethod
+     def _get_title(soup: BeautifulSoup) -> str:
+         title = soup.find('title')
+         return title.get_text(strip=True) if title else ""
+
+     @staticmethod
+     def _get_meta_description(soup: BeautifulSoup) -> str:
+         meta = soup.find('meta', attrs={'name': 'description'})
+         return meta.get('content', '') if meta else ""
+
+     @staticmethod
+     def _get_meta_keywords(soup: BeautifulSoup) -> str:
+         meta = soup.find('meta', attrs={'name': 'keywords'})
+         return meta.get('content', '') if meta else ""

+ class ContentProcessor:
+     """Main content processing orchestrator"""
+
+     def __init__(self):
+         self.url_processor = AdvancedURLProcessor()
+         self.content_extractor = ContentExtractor()
+         self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=5)
+
+     async def process_urls(self, urls: List[str]) -> Dict[str, Any]:
+         """Process multiple URLs concurrently with advanced error handling"""
+         results = {
+             "successful": [],
+             "failed": [],
+             "metadata": {
+                 "total_urls": len(urls),
+                 "start_time": time.strftime("%Y-%m-%d %H:%M:%S")
+             }
+         }
+
          try:
+             # Validate URLs first
+             valid_urls = []
+             for url in urls:
+                 validation_result = self.url_processor.validate_url(url)
+                 if validation_result['is_valid']:
+                     valid_urls.append(url)
                  else:
+                     results['failed'].append({
+                         "url": url,
+                         "error": validation_result['message']
+                     })
+
+             # Process valid URLs concurrently
+             futures = []
+             for url in valid_urls:
+                 future = self.executor.submit(self._process_single_url, url)
+                 futures.append((url, future))
+
+             # Collect results
+             for url, future in futures:
+                 try:
+                     result = future.result(timeout=30) # 30-second timeout
+                     if result["success"]:
+                         results["successful"].append(result)
+                     else:
+                         results["failed"].append({
+                             "url": url,
+                             "error": "Processing failed"
+                         })
+                 except Exception as e:
+                     logger.error(f"Error processing {url}: {e}")
+                     results["failed"].append({
+                         "url": url,
+                         "error": str(e)
+                     })
+
+             # Update metadata
+             results["metadata"].update({
+                 "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
+                 "successful_count": len(results["successful"]),
+                 "failed_count": len(results["failed"])
+             })
+
+             return results
+
          except Exception as e:
+             logger.error(f"Batch processing error: {e}")
+             raise

+     def _process_single_url(self, url: str) -> Dict:
+         """Process a single URL with comprehensive error handling"""
          try:
+             response = self.url_processor.session.get(
+                 url,
+                 timeout=self.url_processor.timeout
+             )
+             response.raise_for_status()
+
+             result = self.content_extractor.extract_text(
+                 response.text,
+                 url
+             )
+
+             result["url"] = url
+             result["timestamp"] = time.strftime("%Y-%m-%d %H:%M:%S")
+
+             return result
+
          except Exception as e:
+             logger.error(f"Error processing {url}: {e}")
+             return {
+                 "success": False,
+                 "url": url,
+                 "error": str(e)
+             }

  def create_interface():
+     """Create Gradio interface with advanced features"""
+     processor = ContentProcessor()

+     with gr.Blocks(title="Advanced URL Content Processor") as interface:
+         gr.Markdown("# Advanced URL Content Processor")

          with gr.Row():
+             with gr.Column():
+                 url_input = gr.Textbox(
+                     label="Enter URLs (one per line)",
+                     placeholder="https://example.com\nhttps://example.org",
+                     lines=5
+                 )

+                 with gr.Row():
+                     process_btn = gr.Button("Process URLs", variant="primary")
+                     clear_btn = gr.Button("Clear")
+
+             with gr.Column():
+                 status_output = gr.JSON(
+                     label="Processing Results",
+                     show_label=True
+                 )
+
+                 gr.Markdown("## Processing Status")
+                 with gr.Row():
+                     progress_output = gr.Textbox(
+                         label="Progress",
+                         show_label=True
+                     )

+         async def process_urls(urls):
+             if not urls.strip():
+                 return {"error": "No URLs provided"}
+
+             url_list = [url.strip() for url in urls.splitlines() if url.strip()]
+             results = await processor.process_urls(url_list)
+             return results

+         def clear_inputs():
+             return None, None

+         process_btn.click(
+             fn=process_urls,
+             inputs=[url_input],
+             outputs=[status_output]
+         )

+         clear_btn.click(
+             fn=clear_inputs,
+             inputs=[],
+             outputs=[url_input, status_output]
+         )
+
      return interface

  if __name__ == "__main__":
      # Initialize mimetypes
      mimetypes.init()

+     # Create and launch interface
      interface = create_interface()
      interface.launch(
          server_name="0.0.0.0",
          server_port=7860,
+         share=False,
          debug=True
+     )
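
The new validate_url combines three ideas: a cachetools cache in front of the network, the ratelimit decorators @sleep_and_retry / @limits to throttle outbound requests, and a HEAD request to confirm reachability. Below is a minimal standalone sketch of that pattern, assuming the same third-party packages (ratelimit, cachetools, validators, requests) are installed; the function name check_url and the module-level CACHE are illustrative and are not defined in app.py.

import cachetools
import requests
import validators
from ratelimit import limits, sleep_and_retry

# Illustrative names: CACHE and check_url are not part of app.py.
CACHE = cachetools.TTLCache(maxsize=100, ttl=3600)  # remember answers for one hour

@sleep_and_retry                # sleep until the current rate window allows another call
@limits(calls=10, period=60)    # at most 10 validations per 60 seconds
def check_url(url: str, timeout: int = 15) -> bool:
    """Return True if the URL is well formed and answers a HEAD request."""
    if url in CACHE:            # cached answers skip the network entirely
        return CACHE[url]
    ok = bool(validators.url(url))
    if ok:
        try:
            response = requests.head(url, timeout=timeout, allow_redirects=True)
            ok = response.status_code in (200, 301, 302)
        except requests.RequestException:
            ok = False
    CACHE[url] = ok
    return ok

if __name__ == "__main__":
    print(check_url("https://example.com"))

Note that because the decorators wrap the whole function, cache hits still count against the 10-calls-per-minute budget; the same holds for the committed validate_url, whose cache lookup also sits inside the decorated body.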