acecalisto3 committed
Commit 76eedaa · verified · 1 Parent(s): 66892b0

Update app2.py

Files changed (1)
  1. app2.py (+213 -663)
app2.py CHANGED
@@ -30,659 +30,146 @@ logging.basicConfig(
30
  logger = logging.getLogger(__name__)
31
 
32
 
33
  class URLProcessor:
34
  def __init__(self):
35
- self.session = requests.Session()
36
- self.timeout = 10 # seconds
37
- self.session.headers.update({
38
- 'User-Agent': UserAgent().random,
39
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
40
- 'Accept-Language': 'en-US,en;q=0.5',
41
- 'Accept-Encoding': 'gzip, deflate, br',
42
- 'Connection': 'keep-alive',
43
- 'Upgrade-Insecure-Requests': '1'
44
- })
45
-
46
- def advanced_text_cleaning(self, text: str) -> str:
47
- """Robust text cleaning with version compatibility"""
48
- try:
49
- cleaned_text = clean(
50
- text,
51
- to_ascii=True,
52
- lower=True,
53
- no_line_breaks=True,
54
- no_urls=True,
55
- no_emails=True,
56
- no_phone_numbers=True,
57
- no_numbers=False,
58
- no_digits=False,
59
- no_currency_symbols=True,
60
- no_punct=False
61
- ).strip()
62
- return cleaned_text
63
- except Exception as e:
64
- logger.warning(f"Text cleaning error: {e}. Using fallback method.")
65
- text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text) # Remove control characters
66
- text = text.encode('ascii', 'ignore').decode('ascii') # Remove non-ASCII characters
67
- text = re.sub(r'\s+', ' ', text) # Normalize whitespace
68
- return text.strip()
69
-
70
- def validate_url(self, url: str) -> Dict:
71
- """Validate URL format and accessibility"""
72
- try:
73
- if not validators.url(url):
74
- return {'is_valid': False, 'message': 'Invalid URL format'}
75
- # Try with DNS resolution retry
76
- for attempt in range(3): # Try up to 3 times
77
- try:
78
- # Some sites block HEAD requests but allow GET
79
- try:
80
- response = self.session.head(url, timeout=self.timeout)
81
- response.raise_for_status()
82
- except (requests.exceptions.RequestException, Exception) as e:
83
- logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
84
- # Try with GET request if HEAD fails
85
- response = self.session.get(url, timeout=self.timeout, stream=True)
86
- response.raise_for_status()
87
- # Close the connection to avoid downloading the entire content
88
- response.close()
89
-
90
- return {'is_valid': True, 'message': 'URL is valid and accessible'}
91
- except requests.exceptions.ConnectionError as e:
92
- if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
93
- logger.warning(f"DNS resolution failed for {url}, attempt {attempt + 1}/3")
94
- time.sleep(1) # Wait a bit before retrying
95
- continue
96
- else:
97
- raise
98
- except Exception as e:
99
- raise
100
- # If we get here, all attempts failed
101
- return {'is_valid': False,
102
- 'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
103
- except Exception as e:
104
- logger.error(f"URL validation failed for {url}: {str(e)}")
105
- return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
106
-
107
  def fetch_content(self, url: str) -> Optional[Dict]:
108
- """Universal content fetcher with special case handling"""
109
- try:
110
- logger.info(f"Fetching content from: {url}")
111
-
112
- # Google Drive document handling
113
- if 'drive.google.com' in url:
114
- return self._handle_google_drive(url)
115
- # Google Calendar ICS handling
116
- if 'calendar.google.com' in url and 'ical' in url:
117
- return self._handle_google_calendar(url)
118
- # Try standard HTML processing first
119
- result = self._fetch_html_content(url)
120
-
121
- # If standard processing failed or returned minimal content, try with Selenium
122
- if not result or len(result.get('content', '')) < 100:
123
- logger.info(
124
- f"Standard processing failed or returned minimal content for {url}, trying Selenium")
125
- selenium_html = self._fetch_with_selenium(url)
126
- if selenium_html:
127
- # Process the Selenium HTML
128
- soup = BeautifulSoup(selenium_html, 'html.parser')
129
- # Remove unwanted elements
130
- for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
131
- element.decompose()
132
-
133
- # Apply the same content extraction strategies as in _fetch_html_content
134
- # Strategy 1: Look for semantic HTML5 elements
135
- main_content = None
136
- for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post',
137
- '.entry', '.page']:
138
- elements = soup.select(selector)
139
- if elements:
140
- main_content = elements[0]
141
- logger.info(f"Found content with selector: {selector}")
142
- break
143
-
144
- # If no main content found, use body
145
- if not main_content or not main_content.get_text(strip=True):
146
- main_content = soup.body if soup.body else soup
147
-
148
- # Extract text
149
- text_content = main_content.get_text(separator='\n', strip=True)
150
-
151
- # Clean content
152
- cleaned_content = self.advanced_text_cleaning(text_content)
153
-
154
- if len(cleaned_content) >= 20:
155
- result = {
156
- 'content': cleaned_content,
157
- 'content_type': 'text/html',
158
- 'timestamp': datetime.now().isoformat(),
159
- 'url': url,
160
- 'source': 'selenium' # Mark that this came from Selenium
161
- }
162
- # Log the result status
163
- if result:
164
- logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
165
- else:
166
- logger.error(f"Failed to extract content from {url}")
167
- return result
168
- except Exception as e:
169
- logger.error(f"Content fetch failed for {url}: {e}")
170
- return None
171
-
172
- def _fetch_html_content(self, url: str) -> Optional[Dict]:
173
- """Standard HTML content processing"""
174
- try:
175
- # Try with a different user agent if it's a social media site
176
- if any(domain in url for domain in
177
- ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
178
- # Use a more realistic browser user agent instead of random one
179
- self.session.headers.update({
180
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
181
- # Add cookie consent headers to bypass some login walls
182
- 'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080; consent_accepted=true; cookie_consent=accepted',
183
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
184
- 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
185
- 'sec-ch-ua-mobile': '?0',
186
- 'sec-ch-ua-platform': '"macOS"',
187
- 'Sec-Fetch-Dest': 'document',
188
- 'Sec-Fetch-Mode': 'navigate',
189
- 'Sec-Fetch-Site': 'none',
190
- 'Sec-Fetch-User': '?1',
191
- 'Upgrade-Insecure-Requests': '1'
192
- })
193
- # For Facebook, try to access the mobile version which often has fewer restrictions
194
- if 'facebook.com' in url and 'm.facebook.com' not in url:
195
- url = url.replace('www.facebook.com', 'm.facebook.com')
196
- logger.info(f"Switched to mobile Facebook URL: {url}")
197
-
198
- # Add a delay to simulate human browsing
199
- time.sleep(1)
200
-
201
- # Try to get the page with multiple attempts
202
- max_attempts = 3
203
- for attempt in range(max_attempts):
204
  try:
205
- response = self.session.get(url, timeout=self.timeout)
206
- response.raise_for_status()
207
- break
208
- except (requests.exceptions.RequestException, Exception) as e:
209
- if attempt < max_attempts - 1:
210
- logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}. Retrying...")
211
- time.sleep(2) # Wait longer between retries
212
- else:
213
- raise
214
- logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
215
-
216
- # Save the raw HTML for debugging if needed
217
- debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
218
- with open(debug_path, "w", encoding="utf-8") as f:
219
- f.write(response.text)
220
- logger.info(f"Saved raw HTML to {debug_path}")
221
-
222
- # Check if we got a valid response with content
223
- if not response.text or len(response.text) < 100:
224
- logger.error(f"Empty or very short response from {url}")
 
225
  return None
226
-
227
- soup = BeautifulSoup(response.text, 'html.parser')
228
- # Remove unwanted elements
229
- for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
230
- element.decompose()
231
-
232
- # Simulate "ESC key" by removing login walls and overlays common on social media sites
233
- login_wall_selectors = [
234
- '.login-wall', '.signup-wall', '.overlay', '.modal',
235
- '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
236
- '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
237
- '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
238
- '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
239
- '.login-wrapper', '.login-container', '.signup-container',
240
- '.login-modal', '.signup-modal', '.auth-modal', '.auth-wall'
241
- ]
242
- for selector in login_wall_selectors:
243
- for element in soup.select(selector):
244
- logger.info(f"Removing login wall element: {selector}")
245
- element.decompose()
246
-
247
- # Enhanced removal for social media sites
248
- if 'facebook.com' in url:
249
- # Facebook specific elements - simulating ESC key
250
- fb_selectors = [
251
- '[data-testid="cookie-policy-manage-dialog"]',
252
- '[role="banner"]', '[role="complementary"]',
253
- '.login_form_container', '.login_form', '#login_form',
254
- '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
255
- '._5hn6', '._67m7', '.nonLoggedInSignUp',
256
- '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
257
- ]
258
- for selector in fb_selectors:
259
- for element in soup.select(selector):
260
- element.decompose()
261
-
262
- # Look for the main content in mobile version
263
- main_content = soup.select_one('#m_story_permalink_view') or soup.select_one(
264
- '#mobile_injected_video_feed_pagelet')
265
- if main_content:
266
- logger.info("Found Facebook mobile main content")
267
-
268
- elif 'instagram.com' in url:
269
- # Instagram specific elements - simulating ESC key
270
- ig_selectors = [
271
- '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
272
- '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
273
- '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
274
- '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
275
- '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
276
- ]
277
- for selector in ig_selectors:
278
- for element in soup.select(selector):
279
- element.decompose()
280
-
281
- # Try to find the main content
282
- insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one(
283
- '._ab1y')
284
- if insta_content:
285
- logger.info("Found Instagram main content")
286
-
287
- elif 'twitter.com' in url or 'x.com' in url:
288
- # X/Twitter already works well for public content, but clean up any remaining overlays
289
- x_selectors = [
290
- '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
291
- '[data-testid="sheetDialog"]', '[data-testid="mask"]',
292
- '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
293
- '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
294
- '.r-kemksi', '[data-testid="BottomBar"]'
295
- ]
296
- for selector in x_selectors:
297
- for element in soup.select(selector):
298
- element.decompose()
299
-
300
- elif 'huggingface.co' in url:
301
- # Special handling for Hugging Face
302
- logger.info("Applying special handling for Hugging Face")
303
- # Try to find the main content
304
- hf_selectors = ['.prose', '.space-content', '.model-description',
305
- '.dataset-description', 'article', '.markdown']
306
- for selector in hf_selectors:
307
- elements = soup.select(selector)
308
- if elements:
309
- logger.info(f"Found Hugging Face content with selector: {selector}")
310
- break
311
- # Extract content using a general approach - try multiple strategies
312
- # Strategy 1: Look for semantic HTML5 elements
313
- main_content = None
314
- for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry',
315
- '.page']:
316
- elements = soup.select(selector)
317
- if elements:
318
- main_content = elements[0]
319
- logger.info(f"Found content with selector: {selector}")
320
- break
321
-
322
- # Strategy 2: If no semantic elements, try common class names
323
- if not main_content or not main_content.get_text(strip=True):
324
- for div in soup.find_all('div'):
325
- class_name = div.get('class', [])
326
- id_name = div.get('id', '')
327
- if any(term in ' '.join(class_name).lower() for term in
328
- ['content', 'main', 'body', 'article', 'post']):
329
- main_content = div
330
- logger.info(f"Found content with div class: {class_name}")
331
- break
332
- if any(term in id_name.lower() for term in ['content', 'main', 'body', 'article', 'post']):
333
- main_content = div
334
- logger.info(f"Found content with div id: {id_name}")
335
- break
336
-
337
- # Strategy 3: Fall back to body
338
- if not main_content or not main_content.get_text(strip=True):
339
- logger.info(f"No main content container found for {url}, using body")
340
- main_content = soup.body if soup.body else soup
341
-
342
- # Extract text with proper spacing
343
- text_content = main_content.get_text(separator='\n', strip=True)
344
-
345
- # Strategy 4: If content is too short, extract all visible text
346
- if len(text_content) < 100:
347
- logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
348
- visible_text = []
349
- for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']):
350
- if element.get_text(strip=True):
351
- visible_text.append(element.get_text(strip=True))
352
- text_content = '\n'.join(visible_text)
353
-
354
- # Strategy 5: Last resort - get all text from the page
355
- if len(text_content) < 50:
356
- logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
357
- text_content = soup.get_text(separator='\n', strip=True)
358
- # Clean and structure content
359
- cleaned_content = self.advanced_text_cleaning(text_content)
360
-
361
- logger.info(f"Final content length: {len(cleaned_content)} chars")
362
-
363
- # If we still have no content, this is a failure
364
- if len(cleaned_content) < 20:
365
- logger.error(f"Failed to extract meaningful content from {url}")
366
  return None
367
-
368
- return {
369
- 'content': cleaned_content,
370
- 'content_type': response.headers.get('Content-Type', ''),
371
- 'timestamp': datetime.now().isoformat(),
372
- 'url': url # Add the URL to the returned data for reference
373
- }
374
- except Exception as e:
375
- logger.error(f"HTML processing failed for {url}: {e}")
376
- return None
377
-
378
- def _handle_google_drive(self, url: str) -> Optional[Dict]:
379
- """Handle Google Drive document URLs"""
380
- try:
381
- # Construct direct download URL
382
- file_id = url.split("/d/")[1].split("/")[0]
383
- download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
384
- response = self.session.get(download_url, stream=True, timeout=self.timeout)
385
- response.raise_for_status()
386
-
387
- # Read content (limit to the first 1MB)
388
- content = b""
389
- for chunk in response.iter_content(chunk_size=8192): # 8KB chunks
390
- content += chunk
391
- if len(content) > 1024 * 1024: # 1MB limit
392
- content = content[:1024 * 1024]
393
- logger.warning(f"Truncated Google Drive file after 1MB")
394
- break
395
- text_content = content.decode('utf-8', errors='ignore')
396
- cleaned_text = self.advanced_text_cleaning(text_content)
397
-
398
- return {
399
- 'content': cleaned_text,
400
- 'content_type': 'text/plain', # Assume plain text for simplicity
401
- 'timestamp': datetime.now().isoformat(),
402
- 'url': url,
403
- 'source': 'google_drive'
404
- }
405
- except Exception as e:
406
- logger.error(f"Error handling Google Drive URL {url}: {e}")
407
- return None
408
-
409
- def _handle_google_calendar(self, url: str) -> Optional[Dict]:
410
- """Handle Google Calendar ICS URLs"""
411
- try:
412
- response = self.session.get(url, timeout=self.timeout)
413
- response.raise_for_status()
414
- text_content = response.text
415
- cleaned_text = self.advanced_text_cleaning(text_content)
416
- return {
417
- 'content': cleaned_text,
418
- 'content_type': 'text/calendar', # Correct MIME type
419
- 'timestamp': datetime.now().isoformat(),
420
- 'url': url,
421
- 'source': 'google_calendar'
422
- }
423
- except Exception as e:
424
- logger.error(f"Error handling Google Calendar URL {url}: {e}")
425
- return None
426
-
427
- def _fetch_with_selenium(self, url: str) -> Optional[str]:
428
- """Use Selenium as a fallback for difficult sites"""
429
- try:
430
- from selenium import webdriver
431
- from selenium.webdriver.chrome.options import Options
432
- from selenium.webdriver.common.by import By
433
- from selenium.webdriver.support.ui import WebDriverWait
434
- from selenium.webdriver.support import expected_conditions as EC
435
- from selenium.common.exceptions import TimeoutException
436
- import time
437
-
438
- logger.info(f"Attempting to fetch {url} with Selenium")
439
-
440
- # Set up Chrome options
441
- chrome_options = Options()
442
- chrome_options.add_argument("--headless")
443
- chrome_options.add_argument("--no-sandbox")
444
- chrome_options.add_argument("--disable-dev-shm-usage")
445
- chrome_options.add_argument("--disable-gpu")
446
- chrome_options.add_argument("--window-size=1920,1080")
447
- chrome_options.add_argument(
448
- "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
449
-
450
- # Initialize the driver
451
- driver = webdriver.Chrome(options=chrome_options)
452
-
453
- try:
454
- # Navigate to the URL
455
- driver.get(url)
456
-
457
- # Wait for the page to load
458
- WebDriverWait(driver, 10).until(
459
- EC.presence_of_element_located((By.TAG_NAME, "body"))
460
- )
461
-
462
- # Simulate pressing ESC key to dismiss overlays
463
- from selenium.webdriver.common.keys import Keys
464
- action_chains = webdriver.ActionChains(driver)
465
- action_chains.send_keys(Keys.ESCAPE).perform()
466
- time.sleep(1) # give it a moment to take effect
467
- action_chains.reset_actions() # Clear actions
468
-
469
- # try again
470
- action_chains.send_keys(Keys.ESCAPE).perform()
471
- time.sleep(1) # give it a moment to take effect
472
- action_chains.reset_actions()
473
-
474
- # Get the page source
475
- page_source = driver.page_source
476
-
477
- # Save the Selenium HTML for debugging
478
- debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
479
- with open(debug_path, "w", encoding="utf-8") as f:
480
- f.write(page_source)
481
- logger.info(f"Saved Selenium HTML to {debug_path}")
482
-
483
- return page_source
484
- finally:
485
- driver.quit()
486
-
487
- except ImportError:
488
- logger.error("Selenium is not installed. Cannot use browser automation.")
489
- return None
490
- except Exception as e:
491
- logger.error(f"Selenium processing failed for {url}: {e}")
492
- return None
493
-
494
-
495
- class FileProcessor:
496
- """Class to handle file processing"""
497
-
498
- def __init__(self, max_file_size: int = 2 * 1024 * 1024 * 1024): # 2GB default
499
- self.max_file_size = max_file_size
500
- self.supported_text_extensions = {'.txt', '.md', '.csv', '.json', '.xml'}
501
-
502
- def is_text_file(self, filepath: str) -> bool:
503
- """Check if file is a text file"""
504
- try:
505
- mime_type, _ = mimetypes.guess_type(filepath)
506
- return (mime_type and mime_type.startswith('text/')) or \
507
- (os.path.splitext(filepath)[1].lower() in self.supported_text_extensions)
508
- except Exception:
509
- return False
510
-
511
- def process_file(self, file) -> List[Dict]:
512
- """Process uploaded file with enhanced error handling"""
513
- if not file:
514
- return []
515
- dataset = []
516
- try:
517
- file_size = os.path.getsize(file.name)
518
- if file_size > self.max_file_size:
519
- logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
520
- return []
521
- with tempfile.TemporaryDirectory() as temp_dir:
522
- if zipfile.is_zipfile(file.name):
523
- dataset.extend(self._process_zip_file(file.name, temp_dir))
524
- else:
525
- dataset.extend(self._process_single_file(file))
526
- except Exception as e:
527
- logger.error(f"Error processing file: {str(e)}")
528
- return []
529
- return dataset
530
-
531
- def _process_zip_file(self, zip_path, temp_dir):
532
- """Extract and process files within a ZIP archive."""
533
- result = []
534
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
535
- zip_ref.extractall(temp_dir)
536
- for extracted_file in os.listdir(temp_dir):
537
- extracted_file_path = os.path.join(temp_dir, extracted_file)
538
- if os.path.isfile(extracted_file_path):
539
- with open(extracted_file_path, 'r', encoding='utf-8', errors='ignore') as f:
540
- result.append({
541
- 'source': 'file_from_zip',
542
- 'filename': extracted_file,
543
- 'content': f.read(),
544
- 'timestamp': datetime.now().isoformat()
545
- })
546
- return result
547
-
548
- def _process_single_file(self, file) -> List[Dict]:
549
- try:
550
- file_stat = os.stat(file.name)
551
- # For very large files, read in chunks and summarize
552
- if file_stat.st_size > 100 * 1024 * 1024: # 100MB
553
- logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
554
- # Read first and last 1MB for extremely large files
555
- content = ""
556
- with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
557
- content = f.read(1 * 1024 * 1024) # First 1MB
558
- content += "\n...[Content truncated due to large file size]...\n"
559
- # Seek to the last 1MB
560
- f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
561
- content += f.read() # Last 1MB
562
- else:
563
- # Regular file processing
564
- with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
565
- content = f.read()
566
- return [{
567
- 'source': 'file',
568
- 'filename': os.path.basename(file.name),
569
- 'file_size': file_stat.st_size,
570
- 'mime_type': mimetypes.guess_type(file.name)[0],
571
- 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
572
- 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
573
- 'content': content,
574
- 'timestamp': datetime.now().isoformat()
575
- }]
576
- except Exception as e:
577
- logger.error(f"File processing error: {e}")
578
- return []
579
-
580
-
581
- # Move process_all_inputs outside of the FileProcessor class
582
- def process_all_inputs(urls, file, text, notes):
583
- """Process all input types with progress tracking"""
584
- try:
585
- processor = URLProcessor()
586
- file_processor = FileProcessor()
587
- results = []
588
-
589
- # Process URLs
590
- if urls:
591
- url_list = re.split(r'[,\n]', urls)
592
- url_list = [url.strip() for url in url_list if url.strip()]
593
-
594
- for url in url_list:
595
- validation = processor.validate_url(url)
596
- if validation.get('is_valid'):
597
- content = processor.fetch_content(url)
598
- if content:
599
- results.append({
600
- 'source': 'url',
601
- 'url': url,
602
- 'content': content,
603
- 'timestamp': datetime.now().isoformat()
604
- })
605
- # Process files
606
- if file:
607
- results.extend(file_processor.process_file(file))
608
- # Process text input
609
- if text:
610
- cleaned_text = processor.advanced_text_cleaning(text)
611
- results.append({
612
- 'source': 'direct_input',
613
- 'content': cleaned_text,
614
- 'timestamp': datetime.now().isoformat()
615
- })
616
- # Generate output
617
- if results:
618
- output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
619
- output_dir.mkdir(parents=True, exist_ok=True)
620
- output_path = output_dir / f'processed_{int(time.time())}.json'
621
-
622
- with open(output_path, 'w', encoding='utf-8') as f:
623
- json.dump(results, f, ensure_ascii=False, indent=2)
624
- summary = f"Processed {len(results)} items successfully!"
625
- json_data = json.dumps(results, indent=2) # Prepare JSON for QR code
626
- return str(output_path), summary, json_data # Return JSON for editor
627
- else:
628
- return None, "No valid content to process.", ""
629
- except Exception as e:
630
- logger.error(f"Processing error: {e}")
631
- return None, f"Error: {str(e)}", ""
632
-
633
-
634
- # Also move generate_qr_code outside of the FileProcessor class
635
- def generate_qr_code(json_data):
636
- """Generate QR code from JSON data and return the file path."""
637
- if json_data:
638
- return generate_qr(json_data)
639
-
640
-
641
- # Move generate_qr outside of the FileProcessor class as well
642
- def generate_qr(json_data):
643
- """Generate QR code from JSON data and return the file path."""
644
- try:
645
- # Try first with automatic version selection
646
- qr = qrcode.QRCode(
647
- error_correction=qrcode.constants.ERROR_CORRECT_L,
648
- box_size=10,
649
- border=4,
650
- )
651
- qr.add_data(json_data)
652
- qr.make(fit=True)
653
-
654
- img = qrcode.make_image(fill_color="black", back_color="white")
655
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
656
- img.save(temp_file.name)
657
- return temp_file.name
658
- except Exception as e:
659
- # If the data is too large for a QR code
660
- logger.error(f"QR generation error: {e}")
661
-
662
- # Create a simple QR with error message
663
- qr = qrcode.QRCode(
664
- version=1,
665
- error_correction=qrcode.constants.ERROR_CORRECT_L,
666
- box_size=10,
667
- border=4,
668
- )
669
- qr.add_data("Error: Data too large for QR code")
670
- qr.make(fit=True)
671
-
672
- img = qrcode.make_image(fill_color="black", back_color="white")
673
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
674
- img.save(temp_file.name)
675
- return temp_file.name
676
-
677
 
678
  def create_interface():
679
- """Create a comprehensive Gradio interface with advanced features"""
680
  css = """
681
  .container { max-width: 1200px; margin: auto; }
682
  .warning { background-color: #fff3cd; color: #856404; }
683
  .error { background-color: #f8d7da; color: #721c24; }
684
  """
685
- with gr.Blocks(css=css, title="Advanced Text & URL Processing") as interface:
686
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
687
 
688
  with gr.Tab("URL Processing"):
@@ -778,30 +265,93 @@ def check_network_connectivity():
778
  return True, results
779
 
780
 
781
- # Add this to the main function
782
- def main():
783
- # Configure system settings
784
- mimetypes.init()
785
-
786
- # Check network connectivity
787
- network_ok, network_results = check_network_connectivity()
788
- if not network_ok:
789
- logger.warning("Network connectivity issues detected. Some features may not work properly.")
790
- for result in network_results:
791
- logger.warning(f"Test site {result['site']}: {result['status']}")
792
- # Create and launch interface
793
- interface = create_interface()
794
-
795
- # Launch with proper configuration
796
- interface.launch(
797
- server_name="0.0.0.0",
798
- server_port=7860,
799
- show_error=True,
800
- share=False,
801
- inbrowser=True,
802
- debug=True
803
- )
804
 
805
 
806
  if __name__ == "__main__":
807
  main()
 
30
  logger = logging.getLogger(__name__)
31
 
32
 
33
+ # Add these imports at the top
34
+ from config import Config
35
+ from proxy_handler import ProxyHandler
36
+ from robots_handler import RobotsHandler
37
+ import asyncio
38
+ import aiohttp
39
+ from tqdm import tqdm
40
+
41
+ # Add new imports for rate limiting and testing
42
+ from ratelimit import limits, sleep_and_retry
43
+ from typing import Dict, Any, Optional, List
44
+ import pytest
45
+ from urllib.robotparser import RobotFileParser
46
+ import concurrent.futures
47
+
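The config, proxy_handler, and robots_handler modules imported above are not included in this commit. A minimal sketch of the Config interface the new code relies on (get, update, get_all), assuming a plain in-memory dictionary of defaults, could look like this — the default values shown are illustrative, not taken from the real config.py:

class Config:
    # Hypothetical stand-in for config.Config; the real module is not part of this diff.
    _defaults = {
        'RESPECT_ROBOTS': True,
        'USE_PROXY': False,
        'PROXY_URL': '',
        'RATE_LIMIT': 60,   # requests per minute
        'TIMEOUT': 10,      # seconds
        'REQUEST_DELAY': 1,
        'OUTPUT_FORMAT': 'json',
    }

    def __init__(self):
        self._settings = dict(self._defaults)

    def get(self, key, default=None):
        return self._settings.get(key, default)

    def update(self, new_settings):
        self._settings.update(new_settings)

    def get_all(self):
        return dict(self._settings)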
48
  class URLProcessor:
49
  def __init__(self):
50
+ self.config = Config()
51
+ self.proxy_handler = ProxyHandler(self.config.get('PROXY_URL'))
52
+ self.robots_handler = RobotsHandler()
53
+ self.session = self._create_session()
54
+ self.rate_limit = self.config.get('RATE_LIMIT', 60) # requests per minute
55
+ self.timeout = self.config.get('TIMEOUT', 10)
56
+
57
+ @sleep_and_retry
58
+ @limits(calls=60, period=60) # Rate limiting decorator
59
  def fetch_content(self, url: str) -> Optional[Dict]:
60
+ """Fetch content with rate limiting"""
61
+ if self.config.get('RESPECT_ROBOTS', True):
62
+ if not self.robots_handler.can_fetch(url):
63
+ logger.warning(f"Skipping {url} - robots.txt disallowed")
64
+ return None
65
+
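The @sleep_and_retry and @limits decorators above come from the ratelimit package: @limits raises ratelimit.RateLimitException once the call budget for the period is exhausted, and @sleep_and_retry catches that exception and sleeps until the window resets, so the decorated method blocks rather than fails. A standalone sketch of the same pattern (example.com is only a placeholder):

import requests
from ratelimit import limits, sleep_and_retry

@sleep_and_retry
@limits(calls=60, period=60)  # at most 60 calls per 60-second window
def rate_limited_get(url: str) -> str:
    # Sleeps until the window resets instead of raising once the budget is spent.
    return requests.get(url, timeout=10).text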
66
+ def _create_session(self):
67
+ session = requests.Session()
68
+ if self.config.get('USE_PROXY'):
69
+ session.proxies = self.proxy_handler.get_proxy_config()
70
+ session.headers.update({
71
+ 'User-Agent': UserAgent().random,
72
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
73
+ 'Accept-Language': 'en-US,en;q=0.5',
74
+ 'Accept-Encoding': 'gzip, deflate, br',
75
+ 'Connection': 'keep-alive',
76
+ 'Upgrade-Insecure-Requests': '1'
77
+ })
78
+ return session
79
+
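RobotsHandler.can_fetch is called in fetch_content, but robots_handler.py is also outside this diff. Given that the new imports already pull in urllib.robotparser, a plausible minimal implementation (purely illustrative) might be:

from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

class RobotsHandler:
    # Hypothetical stand-in for robots_handler.RobotsHandler; not included in this commit.
    def __init__(self, user_agent: str = '*'):
        self.user_agent = user_agent
        self._parsers = {}  # one cached RobotFileParser per scheme://host

    def can_fetch(self, url: str) -> bool:
        parts = urlparse(url)
        base = f"{parts.scheme}://{parts.netloc}"
        parser = self._parsers.get(base)
        if parser is None:
            parser = RobotFileParser(f"{base}/robots.txt")
            try:
                parser.read()
            except Exception:
                return True  # fail open if robots.txt cannot be fetched
            self._parsers[base] = parser
        return parser.can_fetch(self.user_agent, url)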
80
+ def _fetch_with_selenium(self, url: str) -> Optional[str]:
81
+ try:
83
+ from selenium import webdriver
84
+ from selenium.webdriver.chrome.options import Options
85
+ from selenium.webdriver.common.by import By
86
+ from selenium.webdriver.support.ui import WebDriverWait
87
+ from selenium.webdriver.support import expected_conditions as EC
88
+ from selenium.common.exceptions import TimeoutException
89
+ import time
90
+
91
+ logger.info(f"Attempting to fetch {url} with Selenium")
92
+
93
+ # Set up Chrome options
94
+ chrome_options = Options()
95
+ chrome_options.add_argument("--headless")
96
+ chrome_options.add_argument("--no-sandbox")
97
+ chrome_options.add_argument("--disable-dev-shm-usage")
98
+ chrome_options.add_argument("--disable-gpu")
99
+ chrome_options.add_argument("--window-size=1920,1080")
100
+ chrome_options.add_argument(
101
+ "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
102
+
103
+ # Initialize the driver
104
+ driver = webdriver.Chrome(options=chrome_options)
105
+
106
  try:
107
+ # Navigate to the URL
108
+ driver.get(url)
109
+
110
+ # Wait for the page to load
111
+ WebDriverWait(driver, 10).until(
112
+ EC.presence_of_element_located((By.TAG_NAME, "body"))
113
+ )
114
+
115
+ # Simulate pressing ESC key to dismiss overlays
116
+ from selenium.webdriver.common.keys import Keys
117
+ action_chains = webdriver.ActionChains(driver)
118
+ action_chains.send_keys(Keys.ESCAPE).perform()
119
+ time.sleep(1) # give it a moment to take effect
120
+ action_chains.reset_actions() # Clear actions
121
+
122
+ # try again
123
+ action_chains.send_keys(Keys.ESCAPE).perform()
124
+ time.sleep(1) # give it a moment to take effect
125
+ action_chains.reset_actions()
126
+
127
+ # Get the page source
128
+ page_source = driver.page_source
129
+
130
+ # Save the Selenium HTML for debugging
131
+ debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
132
+ with open(debug_path, "w", encoding="utf-8") as f:
133
+ f.write(page_source)
134
+ logger.info(f"Saved Selenium HTML to {debug_path}")
135
+
136
+ return page_source
137
+ finally:
138
+ driver.quit()
139
+
140
+ except ImportError:
141
+ logger.error("Selenium is not installed. Cannot use browser automation.")
142
  return None
143
+ except Exception as e:
144
+ logger.error(f"Selenium processing failed for {url}: {e}")
145
  return None
146
+
147
+ async def fetch_urls_async(self, urls: List[str]) -> List[Dict]:
148
+ """Asynchronous URL fetching with rate limiting"""
149
+ async with aiohttp.ClientSession() as session:
150
+ tasks = []
151
+ for url in urls:
152
+ if len(tasks) >= self.rate_limit:
153
+ await asyncio.sleep(60) # Rate limiting
154
+ tasks = []
155
+ tasks.append(self.fetch_content_async(session, url))
156
+ return await asyncio.gather(*tasks)
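fetch_urls_async awaits self.fetch_content_async(session, url), which is not defined anywhere in this commit. A hedged sketch of what such a coroutine might look like with aiohttp (the helper name and result fields simply mirror the synchronous fetch_content; inside URLProcessor it would take self and read self.timeout):

import logging
from datetime import datetime
from typing import Dict, Optional

import aiohttp

logger = logging.getLogger(__name__)

async def fetch_content_async(session: aiohttp.ClientSession, url: str,
                              timeout: int = 10) -> Optional[Dict]:
    # Hypothetical helper assumed by fetch_urls_async; not present in this diff.
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout)) as response:
            response.raise_for_status()
            text = await response.text()
            return {
                'content': text,
                'content_type': response.headers.get('Content-Type', ''),
                'timestamp': datetime.now().isoformat(),
                'url': url,
            }
    except Exception as e:
        logger.error(f"Async fetch failed for {url}: {e}")
        return None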
157
 
158
  def create_interface():
 
159
  css = """
160
  .container { max-width: 1200px; margin: auto; }
161
  .warning { background-color: #fff3cd; color: #856404; }
162
  .error { background-color: #f8d7da; color: #721c24; }
163
  """
164
+
165
+ with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
166
+ with gr.Tab("Settings"):
167
+ respect_robots = gr.Checkbox(label="Respect robots.txt", value=True)
168
+ use_proxy = gr.Checkbox(label="Use Proxy", value=False)
169
+ proxy_url = gr.Textbox(label="Proxy URL", placeholder="http://proxy:port")
170
+ request_delay = gr.Slider(minimum=0, maximum=10, value=1, label="Request Delay (seconds)")
171
+ output_format = gr.Dropdown(choices=["json", "csv", "txt"], value="json", label="Output Format")
172
+
173
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
174
 
175
  with gr.Tab("URL Processing"):
 
265
  return True, results
266
 
267
 
268
+ def validate_config(config: Dict[str, Any]) -> Dict[str, str]:
269
+ """Validate configuration settings"""
270
+ errors = {}
271
+ if config.get('RATE_LIMIT', 0) < 1:
272
+ errors['rate_limit'] = "Rate limit must be positive"
273
+ if config.get('TIMEOUT', 0) < 1:
274
+ errors['timeout'] = "Timeout must be positive"
275
+ if config.get('USE_PROXY') and not config.get('PROXY_URL'):
276
+ errors['proxy'] = "Proxy URL required when proxy is enabled"
277
+ return errors
278
+
279
+ def update_settings(respect_robots: bool, use_proxy: bool, proxy_url: str,
280
+ request_delay: float, output_format: str) -> str:
281
+ """Update application settings"""
282
+ config = Config()
283
+ new_settings = {
284
+ 'RESPECT_ROBOTS': respect_robots,
285
+ 'USE_PROXY': use_proxy,
286
+ 'PROXY_URL': proxy_url,
287
+ 'REQUEST_DELAY': request_delay,
288
+ 'OUTPUT_FORMAT': output_format
289
+ }
290
+
291
+ # Validate settings before updating
292
+ errors = validate_config(new_settings)
293
+ if errors:
294
+ return f"Configuration error: {', '.join(errors.values())}"
295
+
296
+ config.update(new_settings)
297
+ return "Configuration updated successfully"
298
+
299
+ def create_settings_tab() -> gr.Tab:
300
+ """Create settings tab with configuration controls"""
301
+ with gr.Tab("Settings") as settings_tab:
302
+ respect_robots = gr.Checkbox(label="Respect robots.txt", value=True)
303
+ use_proxy = gr.Checkbox(label="Use Proxy", value=False)
304
+ proxy_url = gr.Textbox(label="Proxy URL", placeholder="http://proxy:port")
305
+ request_delay = gr.Slider(minimum=0, maximum=10, value=1, label="Request Delay (seconds)")
306
+ output_format = gr.Dropdown(choices=["json", "csv", "txt"], value="json", label="Output Format")
307
+
308
+ settings_btn = gr.Button("Update Settings")
309
+ settings_output = gr.Textbox(label="Settings Status")
310
+
311
+ settings_btn.click(
312
+ update_settings,
313
+ inputs=[respect_robots, use_proxy, proxy_url, request_delay, output_format],
314
+ outputs=settings_output
315
+ )
316
+
317
+ return settings_tab
318
 
319
+ def main():
320
+ """Main application entry point"""
321
+ try:
322
+ # Initialize system settings
323
+ mimetypes.init()
324
+
325
+ # Validate initial configuration
326
+ config = Config()
327
+ errors = validate_config(config.get_all())
328
+ if errors:
329
+ logger.error(f"Configuration errors found: {errors}")
330
+ sys.exit(1)
331
+
332
+ # Check network connectivity
333
+ network_ok, network_results = check_network_connectivity()
334
+ if not network_ok:
335
+ logger.warning("Network connectivity issues detected. Some features may not work properly.")
336
+ for result in network_results:
337
+ logger.warning(f"Test site {result['site']}: {result['status']}")
338
+
339
+ # Create and launch interface
340
+ interface = create_interface()
341
+
342
+ # Launch with proper configuration
343
+ interface.launch(
344
+ server_name="0.0.0.0",
345
+ server_port=7860,
346
+ show_error=True,
347
+ share=False,
348
+ inbrowser=True,
349
+ debug=True
350
+ )
351
+
352
+ except Exception as e:
353
+ logger.error(f"Application startup failed: {str(e)}")
354
+ sys.exit(1)
355
 
356
  if __name__ == "__main__":
357
  main()