acecalisto3 committed
Commit 6e67e3a · verified · 1 Parent(s): 91a333c

Update app2.py

Files changed (1)
  1. app2.py +519 -0
app2.py CHANGED
@@ -1,3 +1,522 @@
import json
import sys
import os
import re
import time
import logging
import mimetypes
import tempfile
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse
from typing import List, Dict, Tuple, Union, Optional
import requests
import validators
import gradio as gr
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from cleantext import clean
import qrcode
import zipfile

# Setup logging with detailed configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('app.log', encoding='utf-8')
    ])
logger = logging.getLogger(__name__)
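# Note (illustrative, not part of the original commit): the imports above imply
# roughly the following third-party packages; the PyPI names are assumed from
# the import names and may need adjusting for your environment.
#   pip install requests validators gradio beautifulsoup4 fake-useragent clean-text qrcode
# Selenium is imported lazily in _fetch_with_selenium and is optional:
#   pip install selenium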
31
+
32
+
33
+ class URLProcessor:
34
+ def __init__(self):
35
+ self.session = requests.Session()
36
+ self.timeout = 10 # seconds
37
+ self.session.headers.update({
38
+ 'User-Agent': UserAgent().random,
39
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
40
+ 'Accept-Language': 'en-US,en;q=0.5',
41
+ 'Accept-Encoding': 'gzip, deflate, br',
42
+ 'Connection': 'keep-alive',
43
+ 'Upgrade-Insecure-Requests': '1'
44
+ })

    def advanced_text_cleaning(self, text: str) -> str:
        """Robust text cleaning with version compatibility"""
        try:
            cleaned_text = clean(
                text,
                to_ascii=True,
                lower=True,
                no_line_breaks=True,
                no_urls=True,
                no_emails=True,
                no_phone_numbers=True,
                no_numbers=False,
                no_digits=False,
                no_currency_symbols=True,
                no_punct=False
            ).strip()
            return cleaned_text
        except Exception as e:
            logger.warning(f"Text cleaning error: {e}. Using fallback method.")
            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Remove control characters
            text = text.encode('ascii', 'ignore').decode('ascii')  # Remove non-ASCII characters
            text = re.sub(r'\s+', ' ', text)  # Normalize whitespace
            return text.strip()
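    # Illustrative note (not part of the original commit): with these options,
    # clean-text ASCII-folds and lowercases the input, collapses line breaks,
    # and substitutes placeholder tokens for URLs, e-mail addresses and phone
    # numbers; the regex fallback in the except branch only strips control and
    # non-ASCII characters and normalizes whitespace.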

    def validate_url(self, url: str) -> Dict:
        """Validate URL format and accessibility"""
        try:
            if not validators.url(url):
                return {'is_valid': False, 'message': 'Invalid URL format'}
            # Try with DNS resolution retry
            for attempt in range(3):  # Try up to 3 times
                try:
                    # Some sites block HEAD requests but allow GET
                    try:
                        response = self.session.head(url, timeout=self.timeout)
                        response.raise_for_status()
                    except requests.exceptions.RequestException as e:
                        logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
                        # Try with a GET request if HEAD fails
                        response = self.session.get(url, timeout=self.timeout, stream=True)
                        response.raise_for_status()
                        # Close the connection to avoid downloading the entire content
                        response.close()

                    return {'is_valid': True, 'message': 'URL is valid and accessible'}
                except requests.exceptions.ConnectionError as e:
                    if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
                        logger.warning(f"DNS resolution failed for {url}, attempt {attempt + 1}/3")
                        time.sleep(1)  # Wait a bit before retrying
                        continue
                    else:
                        raise
            # If we get here, all attempts failed
            return {'is_valid': False,
                    'message': 'URL validation failed: DNS resolution failed after multiple attempts'}
        except Exception as e:
            logger.error(f"URL validation failed for {url}: {str(e)}")
            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
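    # Illustrative note (not part of the original commit): validate_url always
    # returns a dict with 'is_valid' (bool) and 'message' (str) keys, e.g.
    #   {'is_valid': False, 'message': 'Invalid URL format'}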

    def fetch_content(self, url: str) -> Optional[Dict]:
        """Universal content fetcher with special case handling"""
        try:
            logger.info(f"Fetching content from: {url}")

            # Google Drive document handling
            if 'drive.google.com' in url:
                return self._handle_google_drive(url)
            # Google Calendar ICS handling
            if 'calendar.google.com' in url and 'ical' in url:
                return self._handle_google_calendar(url)
            # Try standard HTML processing first
            result = self._fetch_html_content(url)

            # If standard processing failed or returned minimal content, try with Selenium
            if not result or len(result.get('content', '')) < 100:
                logger.info(
                    f"Standard processing failed or returned minimal content for {url}, trying Selenium")
                selenium_html = self._fetch_with_selenium(url)
                if selenium_html:
                    # Process the Selenium HTML
                    soup = BeautifulSoup(selenium_html, 'html.parser')
                    # Remove unwanted elements
                    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                        element.decompose()

                    # Apply the same content extraction strategies as in _fetch_html_content
                    # Strategy 1: Look for semantic HTML5 elements
                    main_content = None
                    for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post',
                                     '.entry', '.page']:
                        elements = soup.select(selector)
                        if elements:
                            main_content = elements[0]
                            logger.info(f"Found content with selector: {selector}")
                            break

                    # If no main content found, use body
                    if not main_content or not main_content.get_text(strip=True):
                        main_content = soup.body if soup.body else soup

                    # Extract text
                    text_content = main_content.get_text(separator='\n', strip=True)

                    # Clean content
                    cleaned_content = self.advanced_text_cleaning(text_content)

                    if len(cleaned_content) >= 20:
                        result = {
                            'content': cleaned_content,
                            'content_type': 'text/html',
                            'timestamp': datetime.now().isoformat(),
                            'url': url,
                            'source': 'selenium'  # Mark that this came from Selenium
                        }
            # Log the result status
            if result:
                logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
            else:
                logger.error(f"Failed to extract content from {url}")
            return result
        except Exception as e:
            logger.error(f"Content fetch failed for {url}: {e}")
            return None

    def _fetch_html_content(self, url: str) -> Optional[Dict]:
        """Standard HTML content processing"""
        try:
            # Try with a different user agent if it's a social media site
            if any(domain in url for domain in
                   ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
                # Use a more realistic browser user agent instead of a random one
                self.session.headers.update({
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                    # Add cookie consent headers to bypass some login walls
                    'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080; consent_accepted=true; cookie_consent=accepted',
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
                    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
                    'sec-ch-ua-mobile': '?0',
                    'sec-ch-ua-platform': '"macOS"',
                    'Sec-Fetch-Dest': 'document',
                    'Sec-Fetch-Mode': 'navigate',
                    'Sec-Fetch-Site': 'none',
                    'Sec-Fetch-User': '?1',
                    'Upgrade-Insecure-Requests': '1'
                })
                # For Facebook, try the mobile version, which often has fewer restrictions
                if 'facebook.com' in url and 'm.facebook.com' not in url:
                    url = url.replace('www.facebook.com', 'm.facebook.com')
                    logger.info(f"Switched to mobile Facebook URL: {url}")

            # Add a delay to simulate human browsing
            time.sleep(1)

            # Try to get the page with multiple attempts
            max_attempts = 3
            for attempt in range(max_attempts):
                try:
                    response = self.session.get(url, timeout=self.timeout)
                    response.raise_for_status()
                    break
                except requests.exceptions.RequestException as e:
                    if attempt < max_attempts - 1:
                        logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}. Retrying...")
                        time.sleep(2)  # Wait longer between retries
                    else:
                        raise
            logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")

            # Save the raw HTML for debugging if needed (best-effort: the hard-coded
            # path may not exist outside the author's machine)
            try:
                debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
                with open(debug_path, "w", encoding="utf-8") as f:
                    f.write(response.text)
                logger.info(f"Saved raw HTML to {debug_path}")
            except OSError as e:
                logger.warning(f"Could not save debug HTML: {e}")

            # Check if we got a valid response with content
            if not response.text or len(response.text) < 100:
                logger.error(f"Empty or very short response from {url}")
                return None

            soup = BeautifulSoup(response.text, 'html.parser')
            # Remove unwanted elements
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                element.decompose()

            # Simulate "ESC key" by removing login walls and overlays common on social media sites
            login_wall_selectors = [
                '.login-wall', '.signup-wall', '.overlay', '.modal',
                '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
                '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
                '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
                '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
                '.login-wrapper', '.login-container', '.signup-container',
                '.login-modal', '.signup-modal', '.auth-modal', '.auth-wall'
            ]
            for selector in login_wall_selectors:
                for element in soup.select(selector):
                    logger.info(f"Removing login wall element: {selector}")
                    element.decompose()

            # Enhanced removal for social media sites
            if 'facebook.com' in url:
                # Facebook specific elements - simulating ESC key
                fb_selectors = [
                    '[data-testid="cookie-policy-manage-dialog"]',
                    '[role="banner"]', '[role="complementary"]',
                    '.login_form_container', '.login_form', '#login_form',
                    '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
                    '._5hn6', '._67m7', '.nonLoggedInSignUp',
                    '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
                ]
                for selector in fb_selectors:
                    for element in soup.select(selector):
                        element.decompose()

                # Look for the main content in the mobile version
                main_content = soup.select_one('#m_story_permalink_view') or soup.select_one(
                    '#mobile_injected_video_feed_pagelet')
                if main_content:
                    logger.info("Found Facebook mobile main content")

            elif 'instagram.com' in url:
                # Instagram specific elements - simulating ESC key
                ig_selectors = [
                    '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
                    '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
                    '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
                    '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
                    '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
                ]
                for selector in ig_selectors:
                    for element in soup.select(selector):
                        element.decompose()

                # Try to find the main content
                insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one(
                    '._ab1y')
                if insta_content:
                    logger.info("Found Instagram main content")

            elif 'twitter.com' in url or 'x.com' in url:
                # X/Twitter already works well for public content, but clean up any remaining overlays
                x_selectors = [
                    '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
                    '[data-testid="sheetDialog"]', '[data-testid="mask"]',
                    '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
                    '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
                    '.r-kemksi', '[data-testid="BottomBar"]'
                ]
                for selector in x_selectors:
                    for element in soup.select(selector):
                        element.decompose()

            elif 'huggingface.co' in url:
                # Special handling for Hugging Face
                logger.info("Applying special handling for Hugging Face")
                # Try to find the main content
                hf_selectors = ['.prose', '.space-content', '.model-description',
                                '.dataset-description', 'article', '.markdown']
                for selector in hf_selectors:
                    elements = soup.select(selector)
                    if elements:
                        logger.info(f"Found Hugging Face content with selector: {selector}")
                        break

            # Extract content using a general approach - try multiple strategies
            # Strategy 1: Look for semantic HTML5 elements
            main_content = None
            for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry',
                             '.page']:
                elements = soup.select(selector)
                if elements:
                    main_content = elements[0]
                    logger.info(f"Found content with selector: {selector}")
                    break

            # Strategy 2: If no semantic elements, try common class names
            if not main_content or not main_content.get_text(strip=True):
                for div in soup.find_all('div'):
                    class_name = div.get('class', [])
                    id_name = div.get('id', '')
                    if any(term in ' '.join(class_name).lower() for term in
                           ['content', 'main', 'body', 'article', 'post']):
                        main_content = div
                        logger.info(f"Found content with div class: {class_name}")
                        break
                    if any(term in id_name.lower() for term in ['content', 'main', 'body', 'article', 'post']):
                        main_content = div
                        logger.info(f"Found content with div id: {id_name}")
                        break

            # Strategy 3: Fall back to body
            if not main_content or not main_content.get_text(strip=True):
                logger.info(f"No main content container found for {url}, using body")
                main_content = soup.body if soup.body else soup

            # Extract text with proper spacing
            text_content = main_content.get_text(separator='\n', strip=True)

            # Strategy 4: If content is too short, extract all visible text
            if len(text_content) < 100:
                logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
                visible_text = []
                for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']):
                    if element.get_text(strip=True):
                        visible_text.append(element.get_text(strip=True))
                text_content = '\n'.join(visible_text)

            # Strategy 5: Last resort - get all text from the page
            if len(text_content) < 50:
                logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
                text_content = soup.get_text(separator='\n', strip=True)

            # Clean and structure content
            cleaned_content = self.advanced_text_cleaning(text_content)

            logger.info(f"Final content length: {len(cleaned_content)} chars")

            # If we still have no content, this is a failure
            if len(cleaned_content) < 20:
                logger.error(f"Failed to extract meaningful content from {url}")
                return None

            return {
                'content': cleaned_content,
                'content_type': response.headers.get('Content-Type', ''),
                'timestamp': datetime.now().isoformat(),
                'url': url  # Add the URL to the returned data for reference
            }
        except Exception as e:
            logger.error(f"HTML processing failed for {url}: {e}")
            return None

    def _handle_google_drive(self, url: str) -> Optional[Dict]:
        """Handle Google Drive document URLs"""
        try:
            # Construct the direct download URL
            file_id = url.split("/d/")[1].split("/")[0]
            download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
            response = self.session.get(download_url, stream=True, timeout=self.timeout)
            response.raise_for_status()

            # Read content (limit to the first 1MB)
            content = b""
            for chunk in response.iter_content(chunk_size=8192):  # 8KB chunks
                content += chunk
                if len(content) > 1024 * 1024:  # 1MB limit
                    content = content[:1024 * 1024]
                    logger.warning("Truncated Google Drive file after 1MB")
                    break
            text_content = content.decode('utf-8', errors='ignore')
            cleaned_text = self.advanced_text_cleaning(text_content)

            return {
                'content': cleaned_text,
                'content_type': 'text/plain',  # Assume plain text for simplicity
                'timestamp': datetime.now().isoformat(),
                'url': url,
                'source': 'google_drive'
            }
        except Exception as e:
            logger.error(f"Error handling Google Drive URL {url}: {e}")
            return None
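    # Illustrative note (not part of the original commit): the file_id parsing above
    # assumes a sharing URL of the form
    #   https://drive.google.com/file/d/<FILE_ID>/view?usp=sharing
    # A URL without a "/d/<FILE_ID>/" segment raises an IndexError, which the
    # surrounding except block turns into a None return.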

    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
        """Handle Google Calendar ICS URLs"""
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()
            text_content = response.text
            cleaned_text = self.advanced_text_cleaning(text_content)
            return {
                'content': cleaned_text,
                'content_type': 'text/calendar',  # Correct MIME type
                'timestamp': datetime.now().isoformat(),
                'url': url,
                'source': 'google_calendar'
            }
        except Exception as e:
            logger.error(f"Error handling Google Calendar URL {url}: {e}")
            return None

    def _fetch_with_selenium(self, url: str) -> Optional[str]:
        """Use Selenium as a fallback for difficult sites"""
        try:
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
            from selenium.webdriver.common.by import By
            from selenium.webdriver.support.ui import WebDriverWait
            from selenium.webdriver.support import expected_conditions as EC
            from selenium.common.exceptions import TimeoutException
            import time

            logger.info(f"Attempting to fetch {url} with Selenium")

            # Set up Chrome options
            chrome_options = Options()
            chrome_options.add_argument("--headless")
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--window-size=1920,1080")
            chrome_options.add_argument(
                "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

            # Initialize the driver
            driver = webdriver.Chrome(options=chrome_options)

            try:
                # Navigate to the URL
                driver.get(url)

                # Wait for the page to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Simulate pressing the ESC key to dismiss overlays
                from selenium.webdriver.common.keys import Keys
                action_chains = webdriver.ActionChains(driver)
                action_chains.send_keys(Keys.ESCAPE).perform()
                time.sleep(1)  # Give it a moment to take effect
                action_chains.reset_actions()  # Clear actions

                # Try again
                action_chains.send_keys(Keys.ESCAPE).perform()
                time.sleep(1)  # Give it a moment to take effect
                action_chains.reset_actions()

                # Get the page source
                page_source = driver.page_source

                # Save the Selenium HTML for debugging (best-effort: the hard-coded
                # path may not exist outside the author's machine)
                try:
                    debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
                    with open(debug_path, "w", encoding="utf-8") as f:
                        f.write(page_source)
                    logger.info(f"Saved Selenium HTML to {debug_path}")
                except OSError as e:
                    logger.warning(f"Could not save Selenium debug HTML: {e}")

                return page_source
            finally:
                driver.quit()

        except ImportError:
            logger.error("Selenium is not installed. Cannot use browser automation.")
            return None
        except Exception as e:
            logger.error(f"Selenium processing failed for {url}: {e}")
            return None
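    # Illustrative note (not part of the original commit): this fallback assumes a
    # Chrome/Chromium binary is available. With Selenium 4.6+ the bundled Selenium
    # Manager can fetch a matching chromedriver automatically; on older versions a
    # chromedriver executable must already be on PATH.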
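# Usage sketch (illustrative, not part of the original commit): a typical call
# sequence for URLProcessor as defined above; the example URL is hypothetical.
#
#   processor = URLProcessor()
#   check = processor.validate_url("https://example.com/article")
#   if check['is_valid']:
#       result = processor.fetch_content("https://example.com/article")
#       if result:
#           print(result['content_type'], len(result['content']))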


class FileProcessor:
    """Class to handle file processing"""

    def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024):  # 2GB default
        self.max_file_size = max_file_size
        self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}

    def is_text_file(self, filepath: str) -> bool:
        """Check if file is a text file"""
        try:
            mime_type, _ = mimetypes.guess_type(filepath)
            return (mime_type and mime_type.startswith('text/')) or \
                   (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
        except Exception:
            return False

    def process_file(self, file) -> List[Dict]:
        """Process uploaded file with enhanced error handling"""
        if not file:
            return []
        dataset = []
        try:
            file_size = os.path.getsize(file.name)
            if file_size > self.max_file_size:
                logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                return []
            with tempfile.TemporaryDirectory() as temp_dir:
                if zipfile.is_zipfile(file.name):