acecalisto3 committed (verified)
Commit 044ca3f · 1 Parent(s): 1206535

Update app.py

Files changed (1)
  1. app.py +188 -150
app.py CHANGED
@@ -72,7 +72,6 @@ class URLProcessor:
         try:
             if not validators.url(url):
                 return {'is_valid': False, 'message': 'Invalid URL format'}
-
            # Try with DNS resolution retry
            for attempt in range(3): # Try up to 3 times
                try:
@@ -87,21 +86,20 @@ class URLProcessor:
                    response.raise_for_status()
                    # Close the connection to avoid downloading the entire content
                    response.close()
-
                    return {'is_valid': True, 'message': 'URL is valid and accessible'}
                except requests.exceptions.ConnectionError as e:
                    if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
-                        logger.warning(f"DNS resolution failed for {url}, attempt {attempt+1}/3")
                        time.sleep(1) # Wait a bit before retrying
                        continue
                    else:
                        raise
                except Exception as e:
                    raise
-
            # If we get here, all attempts failed
-            return {'is_valid': False, 'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
-
        except Exception as e:
            logger.error(f"URL validation failed for {url}: {str(e)}")
            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
@@ -110,51 +108,49 @@ class URLProcessor:
110
  """Universal content fetcher with special case handling"""
111
  try:
112
  logger.info(f"Fetching content from: {url}")
113
-
114
  # Google Drive document handling
115
  if 'drive.google.com' in url:
116
  return self._handle_google_drive(url)
117
-
118
  # Google Calendar ICS handling
119
  if 'calendar.google.com' in url and 'ical' in url:
120
  return self._handle_google_calendar(url)
121
-
122
  # Try standard HTML processing first
123
  result = self._fetch_html_content(url)
124
-
125
  # If standard processing failed or returned minimal content, try with Selenium
126
  if not result or len(result.get('content', '')) < 100:
127
- logger.info(f"Standard processing failed or returned minimal content for {url}, trying Selenium")
 
128
  selenium_html = self._fetch_with_selenium(url)
129
-
130
  if selenium_html:
131
  # Process the Selenium HTML
132
  soup = BeautifulSoup(selenium_html, 'html.parser')
133
-
134
  # Remove unwanted elements
135
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
136
  element.decompose()
137
-
138
  # Apply the same content extraction strategies as in _fetch_html_content
139
  # Strategy 1: Look for semantic HTML5 elements
140
  main_content = None
141
- for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
 
142
  elements = soup.select(selector)
143
  if elements:
144
  main_content = elements[0]
145
  logger.info(f"Found content with selector: {selector}")
146
  break
147
-
148
  # If no main content found, use body
149
  if not main_content or not main_content.get_text(strip=True):
150
  main_content = soup.body if soup.body else soup
151
-
152
  # Extract text
153
  text_content = main_content.get_text(separator='\n', strip=True)
154
-
155
  # Clean content
156
  cleaned_content = self.advanced_text_cleaning(text_content)
157
-
158
  if len(cleaned_content) >= 20:
159
  result = {
160
  'content': cleaned_content,
@@ -163,13 +159,11 @@ class URLProcessor:
163
  'url': url,
164
  'source': 'selenium' # Mark that this came from Selenium
165
  }
166
-
167
- # Log the result status
168
  if result:
169
  logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
170
  else:
171
  logger.error(f"Failed to extract content from {url}")
172
-
173
  return result
174
  except Exception as e:
175
  logger.error(f"Content fetch failed for {url}: {e}")
@@ -179,7 +173,8 @@ class URLProcessor:
179
  """Standard HTML content processing"""
180
  try:
181
  # Try with a different user agent if it's a social media site
182
- if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
 
183
  # Use a more realistic browser user agent instead of random one
184
  self.session.headers.update({
185
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
@@ -199,10 +194,10 @@ class URLProcessor:
199
  if 'facebook.com' in url and 'm.facebook.com' not in url:
200
  url = url.replace('www.facebook.com', 'm.facebook.com')
201
  logger.info(f"Switched to mobile Facebook URL: {url}")
202
-
203
- # Add a delay to simulate human browsing
204
- time.sleep(1)
205
-
206
  # Try to get the page with multiple attempts
207
  max_attempts = 3
208
  for attempt in range(max_attempts):
@@ -212,34 +207,32 @@ class URLProcessor:
212
  break
213
  except (requests.exceptions.RequestException, Exception) as e:
214
  if attempt < max_attempts - 1:
215
- logger.warning(f"Attempt {attempt+1} failed for {url}: {e}. Retrying...")
216
  time.sleep(2) # Wait longer between retries
217
  else:
218
  raise
219
-
220
  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
221
-
222
  # Save the raw HTML for debugging if needed
223
  debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
224
  with open(debug_path, "w", encoding="utf-8") as f:
225
  f.write(response.text)
226
  logger.info(f"Saved raw HTML to {debug_path}")
227
-
228
  # Check if we got a valid response with content
229
  if not response.text or len(response.text) < 100:
230
  logger.error(f"Empty or very short response from {url}")
231
  return None
232
-
233
- soup = BeautifulSoup(response.text, 'html.parser')
234
 
 
235
  # Remove unwanted elements
236
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
237
  element.decompose()
238
-
239
  # Simulate "ESC key" by removing login walls and overlays common on social media sites
240
  login_wall_selectors = [
241
- '.login-wall', '.signup-wall', '.overlay', '.modal',
242
- '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
243
  '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
244
  '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
245
  '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
@@ -250,85 +243,89 @@ class URLProcessor:
250
  for element in soup.select(selector):
251
  logger.info(f"Removing login wall element: {selector}")
252
  element.decompose()
253
-
254
  # Enhanced removal for social media sites
255
  if 'facebook.com' in url:
256
  # Facebook specific elements - simulating ESC key
257
  fb_selectors = [
258
- '[data-testid="cookie-policy-manage-dialog"]',
259
- '[role="banner"]', '[role="complementary"]',
260
- '.login_form_container', '.login_form', '#login_form',
261
- '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
262
- '._5hn6', '._67m7', '.nonLoggedInSignUp',
263
  '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
264
  ]
265
  for selector in fb_selectors:
266
  for element in soup.select(selector):
267
  element.decompose()
268
-
269
  # Look for the main content in mobile version
270
- main_content = soup.select_one('#m_story_permalink_view') or soup.select_one('#mobile_injected_video_feed_pagelet')
 
271
  if main_content:
272
  logger.info("Found Facebook mobile main content")
273
-
274
  elif 'instagram.com' in url:
275
  # Instagram specific elements - simulating ESC key
276
  ig_selectors = [
277
- '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
278
- '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
279
- '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
280
  '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
281
  '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
282
  ]
283
  for selector in ig_selectors:
284
  for element in soup.select(selector):
285
  element.decompose()
286
-
287
  # Try to find the main content
288
- insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one('._ab1y')
 
289
  if insta_content:
290
  logger.info("Found Instagram main content")
291
-
292
  elif 'twitter.com' in url or 'x.com' in url:
293
  # X/Twitter already works well for public content, but clean up any remaining overlays
294
  x_selectors = [
295
  '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
296
  '[data-testid="sheetDialog"]', '[data-testid="mask"]',
297
- '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
298
- '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
299
  '.r-kemksi', '[data-testid="BottomBar"]'
300
  ]
301
  for selector in x_selectors:
302
  for element in soup.select(selector):
303
  element.decompose()
304
-
305
  elif 'huggingface.co' in url:
306
  # Special handling for Hugging Face
307
  logger.info("Applying special handling for Hugging Face")
308
  # Try to find the main content
309
- hf_selectors = ['.prose', '.space-content', '.model-description', '.dataset-description', 'article', '.markdown']
 
310
  for selector in hf_selectors:
311
  elements = soup.select(selector)
312
  if elements:
313
  logger.info(f"Found Hugging Face content with selector: {selector}")
314
  break
315
-
316
  # Extract content using a general approach - try multiple strategies
317
  # Strategy 1: Look for semantic HTML5 elements
318
  main_content = None
319
- for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
 
320
  elements = soup.select(selector)
321
  if elements:
322
  main_content = elements[0]
323
  logger.info(f"Found content with selector: {selector}")
324
  break
325
-
326
  # Strategy 2: If no semantic elements, try common class names
327
  if not main_content or not main_content.get_text(strip=True):
328
  for div in soup.find_all('div'):
329
  class_name = div.get('class', [])
330
  id_name = div.get('id', '')
331
- if any(term in ' '.join(class_name).lower() for term in ['content', 'main', 'body', 'article', 'post']):
 
332
  main_content = div
333
  logger.info(f"Found content with div class: {class_name}")
334
  break
@@ -336,15 +333,15 @@ class URLProcessor:
336
  main_content = div
337
  logger.info(f"Found content with div id: {id_name}")
338
  break
339
-
340
  # Strategy 3: Fall back to body
341
  if not main_content or not main_content.get_text(strip=True):
342
  logger.info(f"No main content container found for {url}, using body")
343
  main_content = soup.body if soup.body else soup
344
-
345
  # Extract text with proper spacing
346
  text_content = main_content.get_text(separator='\n', strip=True)
347
-
348
  # Strategy 4: If content is too short, extract all visible text
349
  if len(text_content) < 100:
350
  logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
@@ -353,17 +350,16 @@ class URLProcessor:
353
  if element.get_text(strip=True):
354
  visible_text.append(element.get_text(strip=True))
355
  text_content = '\n'.join(visible_text)
356
-
357
  # Strategy 5: Last resort - get all text from the page
358
  if len(text_content) < 50:
359
  logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
360
  text_content = soup.get_text(separator='\n', strip=True)
361
-
362
  # Clean and structure content
363
  cleaned_content = self.advanced_text_cleaning(text_content)
364
-
365
  logger.info(f"Final content length: {len(cleaned_content)} chars")
366
-
367
  # If we still have no content, this is a failure
368
  if len(cleaned_content) < 20:
369
  logger.error(f"Failed to extract meaningful content from {url}")
@@ -379,6 +375,117 @@ class URLProcessor:
379
  logger.error(f"HTML processing failed for {url}: {e}")
380
  return None
381
382
 
383
  class FileProcessor:
384
  """Class to handle file processing"""
@@ -411,7 +518,7 @@ class FileProcessor:
411
  dataset.extend(self._process_zip_file(file.name, temp_dir))
412
  else:
413
  dataset.extend(self._process_single_file(file))
414
- except Exception as e:
415
  logger.error(f"Error processing file: {str(e)}")
416
  return []
417
  return dataset
@@ -436,17 +543,14 @@ class FileProcessor:
436
  def _process_single_file(self, file) -> List[Dict]:
437
  try:
438
  file_stat = os.stat(file.name)
439
-
440
  # For very large files, read in chunks and summarize
441
  if file_stat.st_size > 100 * 1024 * 1024: # 100MB
442
  logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
443
-
444
  # Read first and last 1MB for extremely large files
445
  content = ""
446
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
447
  content = f.read(1 * 1024 * 1024) # First 1MB
448
  content += "\n...[Content truncated due to large file size]...\n"
449
-
450
  # Seek to the last 1MB
451
  f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
452
  content += f.read() # Last 1MB
@@ -454,7 +558,6 @@ class FileProcessor:
454
  # Regular file processing
455
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
456
  content = f.read()
457
-
458
  return [{
459
  'source': 'file',
460
  'filename': os.path.basename(file.name),
@@ -469,6 +572,7 @@ class FileProcessor:
469
  logger.error(f"File processing error: {e}")
470
  return []
471
 
 
472
  # Move process_all_inputs outside of the FileProcessor class
473
  def process_all_inputs(urls, file, text, notes):
474
  """Process all input types with progress tracking"""
@@ -493,11 +597,9 @@ def process_all_inputs(urls, file, text, notes):
493
  'content': content,
494
  'timestamp': datetime.now().isoformat()
495
  })
496
-
497
  # Process files
498
  if file:
499
  results.extend(file_processor.process_file(file))
500
-
501
  # Process text input
502
  if text:
503
  cleaned_text = processor.advanced_text_cleaning(text)
@@ -506,7 +608,6 @@ def process_all_inputs(urls, file, text, notes):
506
  'content': cleaned_text,
507
  'timestamp': datetime.now().isoformat()
508
  })
509
-
510
  # Generate output
511
  if results:
512
  output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
@@ -515,23 +616,23 @@ def process_all_inputs(urls, file, text, notes):
515
 
516
  with open(output_path, 'w', encoding='utf-8') as f:
517
  json.dump(results, f, ensure_ascii=False, indent=2)
518
-
519
  summary = f"Processed {len(results)} items successfully!"
520
  json_data = json.dumps(results, indent=2) # Prepare JSON for QR code
521
  return str(output_path), summary, json_data # Return JSON for editor
522
  else:
523
  return None, "No valid content to process.", ""
524
-
525
  except Exception as e:
526
  logger.error(f"Processing error: {e}")
527
  return None, f"Error: {str(e)}", ""
528
 
 
529
  # Also move generate_qr_code outside of the FileProcessor class
530
  def generate_qr_code(json_data):
531
  """Generate QR code from JSON data and return the file path."""
532
  if json_data:
533
  return generate_qr(json_data)
534
 
 
535
  # Move generate_qr outside of the FileProcessor class as well
536
  def generate_qr(json_data):
537
  """Generate QR code from JSON data and return the file path."""
@@ -544,7 +645,7 @@ def generate_qr(json_data):
544
  )
545
  qr.add_data(json_data)
546
  qr.make(fit=True)
547
-
548
  img = qrcode.make_image(fill_color="black", back_color="white")
549
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
550
  img.save(temp_file.name)
@@ -552,7 +653,7 @@ def generate_qr(json_data):
552
  except Exception as e:
553
  # If the data is too large for a QR code
554
  logger.error(f"QR generation error: {e}")
555
-
556
  # Create a simple QR with error message
557
  qr = qrcode.QRCode(
558
  version=1,
@@ -562,12 +663,13 @@ def generate_qr(json_data):
562
  )
563
  qr.add_data("Error: Data too large for QR code")
564
  qr.make(fit=True)
565
-
566
  img = qrcode.make_image(fill_color="black", back_color="white")
567
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
568
  img.save(temp_file.name)
569
  return temp_file.name
570
 
 
571
  def create_interface():
572
  """Create a comprehensive Gradio interface with advanced features"""
573
  css = """
@@ -575,7 +677,6 @@ def create_interface():
575
  .warning { background-color: #fff3cd; color: #856404; }
576
  .error { background-color: #f8d7da; color: #721c24; }
577
  """
578
-
579
  with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
580
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
581
 
@@ -628,13 +729,11 @@ def create_interface():
628
  inputs=[url_input, file_input, text_input, scratchpad],
629
  outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
630
  )
631
-
632
  qr_btn.click(
633
  generate_qr_code,
634
  inputs=json_editor,
635
  outputs=qr_output
636
  )
637
-
638
  gr.Markdown("""
639
  ### Usage Guidelines
640
  - **URL Processing**: Enter valid HTTP/HTTPS URLs
@@ -646,11 +745,12 @@ def create_interface():
646
  """)
647
  return interface
648
 
 
649
  def check_network_connectivity():
650
  """Check if the network is working properly by testing connection to common sites"""
651
  test_sites = ["https://www.google.com", "https://www.cloudflare.com", "https://www.amazon.com"]
652
  results = []
653
-
654
  for site in test_sites:
655
  try:
656
  response = requests.get(site, timeout=5)
@@ -665,29 +765,28 @@ def check_network_connectivity():
665
  "status": f"Error: {str(e)}",
666
  "response_time": None
667
  })
668
-
669
  # If all sites failed, there might be a network issue
670
  if all(result["status"].startswith("Error") for result in results):
671
  logger.error("Network connectivity issue detected. All test sites failed.")
672
  return False, results
673
-
674
  return True, results
675
 
 
676
  # Add this to the main function
677
  def main():
678
  # Configure system settings
679
  mimetypes.init()
680
-
681
  # Check network connectivity
682
  network_ok, network_results = check_network_connectivity()
683
  if not network_ok:
684
  logger.warning("Network connectivity issues detected. Some features may not work properly.")
685
  for result in network_results:
686
  logger.warning(f"Test site {result['site']}: {result['status']}")
687
-
688
  # Create and launch interface
689
  interface = create_interface()
690
-
691
  # Launch with proper configuration
692
  interface.launch(
693
  server_name="0.0.0.0",
@@ -698,67 +797,6 @@ def main():
  debug=True
  )

 if __name__ == "__main__":
  main()
-
-
- def _fetch_with_selenium(self, url: str) -> Optional[str]:
- """Use Selenium as a fallback for difficult sites"""
- try:
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import TimeoutException
- import time
-
- logger.info(f"Attempting to fetch {url} with Selenium")
-
- # Set up Chrome options
- chrome_options = Options()
- chrome_options.add_argument("--headless")
- chrome_options.add_argument("--no-sandbox")
- chrome_options.add_argument("--disable-dev-shm-usage")
- chrome_options.add_argument("--disable-gpu")
- chrome_options.add_argument("--window-size=1920,1080")
- chrome_options.add_argument(f"user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-
- # Initialize the driver
- driver = webdriver.Chrome(options=chrome_options)
-
- try:
- # Navigate to the URL
- driver.get(url)
-
- # Wait for the page to load
- WebDriverWait(driver, 10).until(
- EC.presence_of_element_located((By.TAG_NAME, "body"))
- )
-
- # Simulate pressing ESC key to dismiss overlays
- from selenium.webdriver.common.keys import Keys
- webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
-
- # Wait a bit for any animations to complete
- time.sleep(2)
-
- # Get the page source
- page_source = driver.page_source
-
- # Save the Selenium HTML for debugging
- debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
- with open(debug_path, "w", encoding="utf-8") as f:
- f.write(page_source)
- logger.info(f"Saved Selenium HTML to {debug_path}")
-
- return page_source
- finally:
- driver.quit()
-
- except ImportError:
- logger.error("Selenium is not installed. Cannot use browser automation.")
- return None
- except Exception as e:
- logger.error(f"Selenium processing failed for {url}: {e}")
- return None
 
72
  try:
73
  if not validators.url(url):
74
  return {'is_valid': False, 'message': 'Invalid URL format'}
 
75
  # Try with DNS resolution retry
76
  for attempt in range(3): # Try up to 3 times
77
  try:
 
86
  response.raise_for_status()
87
  # Close the connection to avoid downloading the entire content
88
  response.close()
89
+
90
  return {'is_valid': True, 'message': 'URL is valid and accessible'}
91
  except requests.exceptions.ConnectionError as e:
92
  if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
93
+ logger.warning(f"DNS resolution failed for {url}, attempt {attempt + 1}/3")
94
  time.sleep(1) # Wait a bit before retrying
95
  continue
96
  else:
97
  raise
98
  except Exception as e:
99
  raise
 
100
  # If we get here, all attempts failed
101
+ return {'is_valid': False,
102
+ 'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
103
  except Exception as e:
104
  logger.error(f"URL validation failed for {url}: {str(e)}")
105
  return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
 
108
  """Universal content fetcher with special case handling"""
109
  try:
110
  logger.info(f"Fetching content from: {url}")
111
+
112
  # Google Drive document handling
113
  if 'drive.google.com' in url:
114
  return self._handle_google_drive(url)
 
115
  # Google Calendar ICS handling
116
  if 'calendar.google.com' in url and 'ical' in url:
117
  return self._handle_google_calendar(url)
 
118
  # Try standard HTML processing first
119
  result = self._fetch_html_content(url)
120
+
121
  # If standard processing failed or returned minimal content, try with Selenium
122
  if not result or len(result.get('content', '')) < 100:
123
+ logger.info(
124
+ f"Standard processing failed or returned minimal content for {url}, trying Selenium")
125
  selenium_html = self._fetch_with_selenium(url)
 
126
  if selenium_html:
127
  # Process the Selenium HTML
128
  soup = BeautifulSoup(selenium_html, 'html.parser')
 
129
  # Remove unwanted elements
130
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
131
  element.decompose()
132
+
133
  # Apply the same content extraction strategies as in _fetch_html_content
134
  # Strategy 1: Look for semantic HTML5 elements
135
  main_content = None
136
+ for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post',
137
+ '.entry', '.page']:
138
  elements = soup.select(selector)
139
  if elements:
140
  main_content = elements[0]
141
  logger.info(f"Found content with selector: {selector}")
142
  break
143
+
144
  # If no main content found, use body
145
  if not main_content or not main_content.get_text(strip=True):
146
  main_content = soup.body if soup.body else soup
147
+
148
  # Extract text
149
  text_content = main_content.get_text(separator='\n', strip=True)
150
+
151
  # Clean content
152
  cleaned_content = self.advanced_text_cleaning(text_content)
153
+
154
  if len(cleaned_content) >= 20:
155
  result = {
156
  'content': cleaned_content,
 
159
  'url': url,
160
  'source': 'selenium' # Mark that this came from Selenium
161
  }
162
+ # Log the result status
 
163
  if result:
164
  logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
165
  else:
166
  logger.error(f"Failed to extract content from {url}")
 
167
  return result
168
  except Exception as e:
169
  logger.error(f"Content fetch failed for {url}: {e}")
 
173
  """Standard HTML content processing"""
174
  try:
175
  # Try with a different user agent if it's a social media site
176
+ if any(domain in url for domain in
177
+ ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
178
  # Use a more realistic browser user agent instead of random one
179
  self.session.headers.update({
180
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
 
194
  if 'facebook.com' in url and 'm.facebook.com' not in url:
195
  url = url.replace('www.facebook.com', 'm.facebook.com')
196
  logger.info(f"Switched to mobile Facebook URL: {url}")
197
+
198
+ # Add a delay to simulate human browsing
199
+ time.sleep(1)
200
+
201
  # Try to get the page with multiple attempts
202
  max_attempts = 3
203
  for attempt in range(max_attempts):
 
207
  break
208
  except (requests.exceptions.RequestException, Exception) as e:
209
  if attempt < max_attempts - 1:
210
+ logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}. Retrying...")
211
  time.sleep(2) # Wait longer between retries
212
  else:
213
  raise
 
214
  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
215
+
216
  # Save the raw HTML for debugging if needed
217
  debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
218
  with open(debug_path, "w", encoding="utf-8") as f:
219
  f.write(response.text)
220
  logger.info(f"Saved raw HTML to {debug_path}")
221
+
222
  # Check if we got a valid response with content
223
  if not response.text or len(response.text) < 100:
224
  logger.error(f"Empty or very short response from {url}")
225
  return None
 
 
226
 
227
+ soup = BeautifulSoup(response.text, 'html.parser')
228
  # Remove unwanted elements
229
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
230
  element.decompose()
231
+
232
  # Simulate "ESC key" by removing login walls and overlays common on social media sites
233
  login_wall_selectors = [
234
+ '.login-wall', '.signup-wall', '.overlay', '.modal',
235
+ '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
236
  '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
237
  '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
238
  '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
 
243
  for element in soup.select(selector):
244
  logger.info(f"Removing login wall element: {selector}")
245
  element.decompose()
246
+
247
  # Enhanced removal for social media sites
248
  if 'facebook.com' in url:
249
  # Facebook specific elements - simulating ESC key
250
  fb_selectors = [
251
+ '[data-testid="cookie-policy-manage-dialog"]',
252
+ '[role="banner"]', '[role="complementary"]',
253
+ '.login_form_container', '.login_form', '#login_form',
254
+ '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
255
+ '._5hn6', '._67m7', '.nonLoggedInSignUp',
256
  '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
257
  ]
258
  for selector in fb_selectors:
259
  for element in soup.select(selector):
260
  element.decompose()
261
+
262
  # Look for the main content in mobile version
263
+ main_content = soup.select_one('#m_story_permalink_view') or soup.select_one(
264
+ '#mobile_injected_video_feed_pagelet')
265
  if main_content:
266
  logger.info("Found Facebook mobile main content")
267
+
268
  elif 'instagram.com' in url:
269
  # Instagram specific elements - simulating ESC key
270
  ig_selectors = [
271
+ '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
272
+ '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
273
+ '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
274
  '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
275
  '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
276
  ]
277
  for selector in ig_selectors:
278
  for element in soup.select(selector):
279
  element.decompose()
280
+
281
  # Try to find the main content
282
+ insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one(
283
+ '._ab1y')
284
  if insta_content:
285
  logger.info("Found Instagram main content")
286
+
287
  elif 'twitter.com' in url or 'x.com' in url:
288
  # X/Twitter already works well for public content, but clean up any remaining overlays
289
  x_selectors = [
290
  '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
291
  '[data-testid="sheetDialog"]', '[data-testid="mask"]',
292
+ '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
293
+ '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
294
  '.r-kemksi', '[data-testid="BottomBar"]'
295
  ]
296
  for selector in x_selectors:
297
  for element in soup.select(selector):
298
  element.decompose()
299
+
300
  elif 'huggingface.co' in url:
301
  # Special handling for Hugging Face
302
  logger.info("Applying special handling for Hugging Face")
303
  # Try to find the main content
304
+ hf_selectors = ['.prose', '.space-content', '.model-description',
305
+ '.dataset-description', 'article', '.markdown']
306
  for selector in hf_selectors:
307
  elements = soup.select(selector)
308
  if elements:
309
  logger.info(f"Found Hugging Face content with selector: {selector}")
310
  break
 
311
  # Extract content using a general approach - try multiple strategies
312
  # Strategy 1: Look for semantic HTML5 elements
313
  main_content = None
314
+ for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry',
315
+ '.page']:
316
  elements = soup.select(selector)
317
  if elements:
318
  main_content = elements[0]
319
  logger.info(f"Found content with selector: {selector}")
320
  break
321
+
322
  # Strategy 2: If no semantic elements, try common class names
323
  if not main_content or not main_content.get_text(strip=True):
324
  for div in soup.find_all('div'):
325
  class_name = div.get('class', [])
326
  id_name = div.get('id', '')
327
+ if any(term in ' '.join(class_name).lower() for term in
328
+ ['content', 'main', 'body', 'article', 'post']):
329
  main_content = div
330
  logger.info(f"Found content with div class: {class_name}")
331
  break
 
333
  main_content = div
334
  logger.info(f"Found content with div id: {id_name}")
335
  break
336
+
337
  # Strategy 3: Fall back to body
338
  if not main_content or not main_content.get_text(strip=True):
339
  logger.info(f"No main content container found for {url}, using body")
340
  main_content = soup.body if soup.body else soup
341
+
342
  # Extract text with proper spacing
343
  text_content = main_content.get_text(separator='\n', strip=True)
344
+
345
  # Strategy 4: If content is too short, extract all visible text
346
  if len(text_content) < 100:
347
  logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
 
350
  if element.get_text(strip=True):
351
  visible_text.append(element.get_text(strip=True))
352
  text_content = '\n'.join(visible_text)
353
+
354
  # Strategy 5: Last resort - get all text from the page
355
  if len(text_content) < 50:
356
  logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
357
  text_content = soup.get_text(separator='\n', strip=True)
 
358
  # Clean and structure content
359
  cleaned_content = self.advanced_text_cleaning(text_content)
360
+
361
  logger.info(f"Final content length: {len(cleaned_content)} chars")
362
+
363
  # If we still have no content, this is a failure
364
  if len(cleaned_content) < 20:
365
  logger.error(f"Failed to extract meaningful content from {url}")
 
375
  logger.error(f"HTML processing failed for {url}: {e}")
376
  return None
377
 
+    def _handle_google_drive(self, url: str) -> Optional[Dict]:
+        """Handle Google Drive document URLs"""
+        try:
+            # Construct direct download URL
+            file_id = url.split("/d/")[1].split("/")[0]
+            download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+            response = self.session.get(download_url, stream=True, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Read content (limit to the first 1MB)
+            content = b""
+            for chunk in response.iter_content(chunk_size=8192): # 8KB chunks
+                content += chunk
+                if len(content) > 1024 * 1024: # 1MB limit
+                    content = content[:1024 * 1024]
+                    logger.warning(f"Truncated Google Drive file after 1MB")
+                    break
+            text_content = content.decode('utf-8', errors='ignore')
+            cleaned_text = self.advanced_text_cleaning(text_content)
+
+            return {
+                'content': cleaned_text,
+                'content_type': 'text/plain', # Assume plain text for simplicity
+                'timestamp': datetime.now().isoformat(),
+                'url': url,
+                'source': 'google_drive'
+            }
+        except Exception as e:
+            logger.error(f"Error handling Google Drive URL {url}: {e}")
+            return None
+
+    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
+        """Handle Google Calendar ICS URLs"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            text_content = response.text
+            cleaned_text = self.advanced_text_cleaning(text_content)
+            return {
+                'content': cleaned_text,
+                'content_type': 'text/calendar', # Correct MIME type
+                'timestamp': datetime.now().isoformat(),
+                'url': url,
+                'source': 'google_calendar'
+            }
+        except Exception as e:
+            logger.error(f"Error handling Google Calendar URL {url}: {e}")
+            return None
+
+    def _fetch_with_selenium(self, url: str) -> Optional[str]:
+        """Use Selenium as a fallback for difficult sites"""
+        try:
+            from selenium import webdriver
+            from selenium.webdriver.chrome.options import Options
+            from selenium.webdriver.common.by import By
+            from selenium.webdriver.support.ui import WebDriverWait
+            from selenium.webdriver.support import expected_conditions as EC
+            from selenium.common.exceptions import TimeoutException
+            import time
+
+            logger.info(f"Attempting to fetch {url} with Selenium")
+
+            # Set up Chrome options
+            chrome_options = Options()
+            chrome_options.add_argument("--headless")
+            chrome_options.add_argument("--no-sandbox")
+            chrome_options.add_argument("--disable-dev-shm-usage")
+            chrome_options.add_argument("--disable-gpu")
+            chrome_options.add_argument("--window-size=1920,1080")
+            chrome_options.add_argument(
+                "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+            # Initialize the driver
+            driver = webdriver.Chrome(options=chrome_options)
+
+            try:
+                # Navigate to the URL
+                driver.get(url)
+
+                # Wait for the page to load
+                WebDriverWait(driver, 10).until(
+                    EC.presence_of_element_located((By.TAG_NAME, "body"))
+                )
+
+                # Simulate pressing ESC key to dismiss overlays
+                from selenium.webdriver.common.keys import Keys
+                webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
+
+                # Wait a bit for any animations to complete
+                time.sleep(2)
+
+                # Get the page source
+                page_source = driver.page_source
+
+                # Save the Selenium HTML for debugging
+                debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
+                with open(debug_path, "w", encoding="utf-8") as f:
+                    f.write(page_source)
+                logger.info(f"Saved Selenium HTML to {debug_path}")
+
+                return page_source
+            finally:
+                driver.quit()
+
+        except ImportError:
+            logger.error("Selenium is not installed. Cannot use browser automation.")
+            return None
+        except Exception as e:
+            logger.error(f"Selenium processing failed for {url}: {e}")
+            return None
+
+
 
490
  class FileProcessor:
491
  """Class to handle file processing"""
 
518
  dataset.extend(self._process_zip_file(file.name, temp_dir))
519
  else:
520
  dataset.extend(self._process_single_file(file))
521
+ except Exception as e:
522
  logger.error(f"Error processing file: {str(e)}")
523
  return []
524
  return dataset
 
543
  def _process_single_file(self, file) -> List[Dict]:
544
  try:
545
  file_stat = os.stat(file.name)
 
546
  # For very large files, read in chunks and summarize
547
  if file_stat.st_size > 100 * 1024 * 1024: # 100MB
548
  logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
 
549
  # Read first and last 1MB for extremely large files
550
  content = ""
551
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
552
  content = f.read(1 * 1024 * 1024) # First 1MB
553
  content += "\n...[Content truncated due to large file size]...\n"
 
554
  # Seek to the last 1MB
555
  f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
556
  content += f.read() # Last 1MB
 
558
  # Regular file processing
559
  with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
560
  content = f.read()
 
561
  return [{
562
  'source': 'file',
563
  'filename': os.path.basename(file.name),
 
572
  logger.error(f"File processing error: {e}")
573
  return []
574
 
575
+
576
  # Move process_all_inputs outside of the FileProcessor class
577
  def process_all_inputs(urls, file, text, notes):
578
  """Process all input types with progress tracking"""
 
597
  'content': content,
598
  'timestamp': datetime.now().isoformat()
599
  })
 
600
  # Process files
601
  if file:
602
  results.extend(file_processor.process_file(file))
 
603
  # Process text input
604
  if text:
605
  cleaned_text = processor.advanced_text_cleaning(text)
 
608
  'content': cleaned_text,
609
  'timestamp': datetime.now().isoformat()
610
  })
 
611
  # Generate output
612
  if results:
613
  output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
 
616
 
617
  with open(output_path, 'w', encoding='utf-8') as f:
618
  json.dump(results, f, ensure_ascii=False, indent=2)
 
619
  summary = f"Processed {len(results)} items successfully!"
620
  json_data = json.dumps(results, indent=2) # Prepare JSON for QR code
621
  return str(output_path), summary, json_data # Return JSON for editor
622
  else:
623
  return None, "No valid content to process.", ""
 
624
  except Exception as e:
625
  logger.error(f"Processing error: {e}")
626
  return None, f"Error: {str(e)}", ""
627
 
628
+
629
  # Also move generate_qr_code outside of the FileProcessor class
630
  def generate_qr_code(json_data):
631
  """Generate QR code from JSON data and return the file path."""
632
  if json_data:
633
  return generate_qr(json_data)
634
 
635
+
636
  # Move generate_qr outside of the FileProcessor class as well
637
  def generate_qr(json_data):
638
  """Generate QR code from JSON data and return the file path."""
 
645
  )
646
  qr.add_data(json_data)
647
  qr.make(fit=True)
648
+
649
  img = qrcode.make_image(fill_color="black", back_color="white")
650
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
651
  img.save(temp_file.name)
 
653
  except Exception as e:
654
  # If the data is too large for a QR code
655
  logger.error(f"QR generation error: {e}")
656
+
657
  # Create a simple QR with error message
658
  qr = qrcode.QRCode(
659
  version=1,
 
663
  )
664
  qr.add_data("Error: Data too large for QR code")
665
  qr.make(fit=True)
666
+
667
  img = qrcode.make_image(fill_color="black", back_color="white")
668
  temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
669
  img.save(temp_file.name)
670
  return temp_file.name
671
 
672
+
673
  def create_interface():
674
  """Create a comprehensive Gradio interface with advanced features"""
675
  css = """
 
677
  .warning { background-color: #fff3cd; color: #856404; }
678
  .error { background-color: #f8d7da; color: #721c24; }
679
  """
 
680
  with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
681
  gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
682
 
 
729
  inputs=[url_input, file_input, text_input, scratchpad],
730
  outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
731
  )
 
732
  qr_btn.click(
733
  generate_qr_code,
734
  inputs=json_editor,
735
  outputs=qr_output
736
  )
 
737
  gr.Markdown("""
738
  ### Usage Guidelines
739
  - **URL Processing**: Enter valid HTTP/HTTPS URLs
 
745
  """)
746
  return interface
747
 
748
+
749
  def check_network_connectivity():
750
  """Check if the network is working properly by testing connection to common sites"""
751
  test_sites = ["https://www.google.com", "https://www.cloudflare.com", "https://www.amazon.com"]
752
  results = []
753
+
754
  for site in test_sites:
755
  try:
756
  response = requests.get(site, timeout=5)
 
765
  "status": f"Error: {str(e)}",
766
  "response_time": None
767
  })
 
768
  # If all sites failed, there might be a network issue
769
  if all(result["status"].startswith("Error") for result in results):
770
  logger.error("Network connectivity issue detected. All test sites failed.")
771
  return False, results
772
+
773
  return True, results
774
 
775
+
776
  # Add this to the main function
777
  def main():
778
  # Configure system settings
779
  mimetypes.init()
780
+
781
  # Check network connectivity
782
  network_ok, network_results = check_network_connectivity()
783
  if not network_ok:
784
  logger.warning("Network connectivity issues detected. Some features may not work properly.")
785
  for result in network_results:
786
  logger.warning(f"Test site {result['site']}: {result['status']}")
 
787
  # Create and launch interface
788
  interface = create_interface()
789
+
790
  # Launch with proper configuration
791
  interface.launch(
792
  server_name="0.0.0.0",
 
797
  debug=True
798
  )
799
 
800
+
801
  if __name__ == "__main__":
802
  main()