acecalisto3 committed (verified)
Commit 6098474 · 1 Parent(s): b080143

Update app.py

Files changed (1)
  1. app.py +179 -422
app.py CHANGED
@@ -72,7 +72,6 @@ class URLProcessor:
  try:
  if not validators.url(url):
  return {'is_valid': False, 'message': 'Invalid URL format'}
-
  # Try with DNS resolution retry
  for attempt in range(3): # Try up to 3 times
  try:
@@ -87,21 +86,20 @@ class URLProcessor:
  response.raise_for_status()
  # Close the connection to avoid downloading the entire content
  response.close()
-
  return {'is_valid': True, 'message': 'URL is valid and accessible'}
  except requests.exceptions.ConnectionError as e:
  if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
- logger.warning(f"DNS resolution failed for {url}, attempt {attempt+1}/3")
  time.sleep(1) # Wait a bit before retrying
  continue
  else:
  raise
  except Exception as e:
  raise
-
  # If we get here, all attempts failed
- return {'is_valid': False, 'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
-
  except Exception as e:
  logger.error(f"URL validation failed for {url}: {str(e)}")
  return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
@@ -110,51 +108,49 @@ class URLProcessor:
  """Universal content fetcher with special case handling"""
  try:
  logger.info(f"Fetching content from: {url}")
-
  # Google Drive document handling
  if 'drive.google.com' in url:
  return self._handle_google_drive(url)
-
  # Google Calendar ICS handling
  if 'calendar.google.com' in url and 'ical' in url:
  return self._handle_google_calendar(url)
-
  # Try standard HTML processing first
  result = self._fetch_html_content(url)
-
  # If standard processing failed or returned minimal content, try with Selenium
  if not result or len(result.get('content', '')) < 100:
- logger.info(f"Standard processing failed or returned minimal content for {url}, trying Selenium")
  selenium_html = self._fetch_with_selenium(url)
-
  if selenium_html:
  # Process the Selenium HTML
  soup = BeautifulSoup(selenium_html, 'html.parser')
-
  # Remove unwanted elements
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
  element.decompose()
-
  # Apply the same content extraction strategies as in _fetch_html_content
  # Strategy 1: Look for semantic HTML5 elements
  main_content = None
- for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
  elements = soup.select(selector)
  if elements:
  main_content = elements[0]
  logger.info(f"Found content with selector: {selector}")
  break
-
  # If no main content found, use body
  if not main_content or not main_content.get_text(strip=True):
  main_content = soup.body if soup.body else soup
-
  # Extract text
  text_content = main_content.get_text(separator='\n', strip=True)
-
  # Clean content
  cleaned_content = self.advanced_text_cleaning(text_content)
-
  if len(cleaned_content) >= 20:
  result = {
  'content': cleaned_content,
@@ -163,85 +159,22 @@ class URLProcessor:
  'url': url,
  'source': 'selenium' # Mark that this came from Selenium
  }
-
- # Log the result status
  if result:
  logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
  else:
  logger.error(f"Failed to extract content from {url}")
-
  return result
  except Exception as e:
  logger.error(f"Content fetch failed for {url}: {e}")
  return None

-
- def _fetch_with_selenium(self, url: str) -> Optional[str]:
- """Use Selenium as a fallback for difficult sites"""
- try:
- from selenium import webdriver
- from selenium.webdriver.chrome.options import Options
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.support import expected_conditions as EC
- from selenium.common.exceptions import TimeoutException
- import time
-
- logger.info(f"Attempting to fetch {url} with Selenium")
-
- # Set up Chrome options
- chrome_options = Options()
- chrome_options.add_argument("--headless")
- chrome_options.add_argument("--no-sandbox")
- chrome_options.add_argument("--disable-dev-shm-usage")
- chrome_options.add_argument("--disable-gpu")
- chrome_options.add_argument("--window-size=1920,1080")
- chrome_options.add_argument(f"user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-
- # Initialize the driver
- driver = webdriver.Chrome(options=chrome_options)
-
- try:
- # Navigate to the URL
- driver.get(url)
-
- # Wait for the page to load
- WebDriverWait(driver, 10).until(
- EC.presence_of_element_located((By.TAG_NAME, "body"))
- )
-
- # Simulate pressing ESC key to dismiss overlays
- from selenium.webdriver.common.keys import Keys
- webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
-
- # Wait a bit for any animations to complete
- time.sleep(2)
-
- # Get the page source
- page_source = driver.page_source
-
- # Save the Selenium HTML for debugging
- debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
- with open(debug_path, "w", encoding="utf-8") as f:
- f.write(page_source)
- logger.info(f"Saved Selenium HTML to {debug_path}")
-
- return page_source
- finally:
- driver.quit()
-
- except ImportError:
- logger.error("Selenium is not installed. Cannot use browser automation.")
- return None
- except Exception as e:
- logger.error(f"Selenium processing failed for {url}: {e}")
- return None
-
  def _fetch_html_content(self, url: str) -> Optional[Dict]:
  """Standard HTML content processing"""
  try:
  # Try with a different user agent if it's a social media site
- if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
  # Use a more realistic browser user agent instead of random one
  self.session.headers.update({
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
@@ -261,10 +194,10 @@ class URLProcessor:
  if 'facebook.com' in url and 'm.facebook.com' not in url:
  url = url.replace('www.facebook.com', 'm.facebook.com')
  logger.info(f"Switched to mobile Facebook URL: {url}")
-
- # Add a delay to simulate human browsing
- time.sleep(1)
-
  # Try to get the page with multiple attempts
  max_attempts = 3
  for attempt in range(max_attempts):
@@ -274,34 +207,32 @@ class URLProcessor:
  break
  except (requests.exceptions.RequestException, Exception) as e:
  if attempt < max_attempts - 1:
- logger.warning(f"Attempt {attempt+1} failed for {url}: {e}. Retrying...")
  time.sleep(2) # Wait longer between retries
  else:
  raise
-
  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
-
  # Save the raw HTML for debugging if needed
  debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
  with open(debug_path, "w", encoding="utf-8") as f:
  f.write(response.text)
  logger.info(f"Saved raw HTML to {debug_path}")
-
  # Check if we got a valid response with content
  if not response.text or len(response.text) < 100:
  logger.error(f"Empty or very short response from {url}")
  return None
-
- soup = BeautifulSoup(response.text, 'html.parser')

  # Remove unwanted elements
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
  element.decompose()
-
  # Simulate "ESC key" by removing login walls and overlays common on social media sites
  login_wall_selectors = [
- '.login-wall', '.signup-wall', '.overlay', '.modal',
- '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
  '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
  '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
  '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',
@@ -312,85 +243,89 @@ class URLProcessor:
  for element in soup.select(selector):
  logger.info(f"Removing login wall element: {selector}")
  element.decompose()
-
  # Enhanced removal for social media sites
  if 'facebook.com' in url:
  # Facebook specific elements - simulating ESC key
  fb_selectors = [
- '[data-testid="cookie-policy-manage-dialog"]',
- '[role="banner"]', '[role="complementary"]',
- '.login_form_container', '.login_form', '#login_form',
- '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
- '._5hn6', '._67m7', '.nonLoggedInSignUp',
  '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
  ]
  for selector in fb_selectors:
  for element in soup.select(selector):
  element.decompose()
-
  # Look for the main content in mobile version
- main_content = soup.select_one('#m_story_permalink_view') or soup.select_one('#mobile_injected_video_feed_pagelet')
  if main_content:
  logger.info("Found Facebook mobile main content")
-
  elif 'instagram.com' in url:
  # Instagram specific elements - simulating ESC key
  ig_selectors = [
- '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
- '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
- '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
  '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
  '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
  ]
  for selector in ig_selectors:
  for element in soup.select(selector):
  element.decompose()
-
  # Try to find the main content
- insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one('._ab1y')
  if insta_content:
  logger.info("Found Instagram main content")
-
  elif 'twitter.com' in url or 'x.com' in url:
  # X/Twitter already works well for public content, but clean up any remaining overlays
  x_selectors = [
  '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
  '[data-testid="sheetDialog"]', '[data-testid="mask"]',
- '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
- '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
  '.r-kemksi', '[data-testid="BottomBar"]'
  ]
  for selector in x_selectors:
  for element in soup.select(selector):
  element.decompose()
-
  elif 'huggingface.co' in url:
  # Special handling for Hugging Face
  logger.info("Applying special handling for Hugging Face")
  # Try to find the main content
- hf_selectors = ['.prose', '.space-content', '.model-description', '.dataset-description', 'article', '.markdown']
  for selector in hf_selectors:
  elements = soup.select(selector)
  if elements:
  logger.info(f"Found Hugging Face content with selector: {selector}")
  break
-
  # Extract content using a general approach - try multiple strategies
  # Strategy 1: Look for semantic HTML5 elements
  main_content = None
- for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
  elements = soup.select(selector)
  if elements:
  main_content = elements[0]
  logger.info(f"Found content with selector: {selector}")
  break
-
  # Strategy 2: If no semantic elements, try common class names
  if not main_content or not main_content.get_text(strip=True):
  for div in soup.find_all('div'):
  class_name = div.get('class', [])
  id_name = div.get('id', '')
- if any(term in ' '.join(class_name).lower() for term in ['content', 'main', 'body', 'article', 'post']):
  main_content = div
  logger.info(f"Found content with div class: {class_name}")
  break
@@ -398,15 +333,15 @@ class URLProcessor:
  main_content = div
  logger.info(f"Found content with div id: {id_name}")
  break
-
  # Strategy 3: Fall back to body
  if not main_content or not main_content.get_text(strip=True):
  logger.info(f"No main content container found for {url}, using body")
  main_content = soup.body if soup.body else soup
-
  # Extract text with proper spacing
  text_content = main_content.get_text(separator='\n', strip=True)
-
  # Strategy 4: If content is too short, extract all visible text
  if len(text_content) < 100:
  logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
@@ -415,17 +350,16 @@ class URLProcessor:
  if element.get_text(strip=True):
  visible_text.append(element.get_text(strip=True))
  text_content = '\n'.join(visible_text)
-
  # Strategy 5: Last resort - get all text from the page
  if len(text_content) < 50:
  logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
  text_content = soup.get_text(separator='\n', strip=True)
-
  # Clean and structure content
  cleaned_content = self.advanced_text_cleaning(text_content)
-
  logger.info(f"Final content length: {len(cleaned_content)} chars")
-
  # If we still have no content, this is a failure
  if len(cleaned_content) < 20:
  logger.error(f"Failed to extract meaningful content from {url}")
@@ -441,6 +375,122 @@ class URLProcessor:
  logger.error(f"HTML processing failed for {url}: {e}")
  return None

  class FileProcessor:
  """Class to handle file processing"""
@@ -467,297 +517,4 @@ class FileProcessor:
  file_size = os.path.getsize(file.name)
  if file_size > self.max_file_size:
  logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
- return []
- with tempfile.TemporaryDirectory() as temp_dir:
- if zipfile.is_zipfile(file.name):
- dataset.extend(self._process_zip_file(file.name, temp_dir))
- else:
- dataset.extend(self._process_single_file(file))
- except Exception as e:
- logger.error(f"Error processing file: {str(e)}")
- return []
- return dataset
-
- def _process_zip_file(self, zip_path, temp_dir):
- """Extract and process files within a ZIP archive."""
- result = []
- with zipfile.ZipFile(zip_path, 'r') as zip_ref:
- zip_ref.extractall(temp_dir)
- for extracted_file in os.listdir(temp_dir):
- extracted_file_path = os.path.join(temp_dir, extracted_file)
- if os.path.isfile(extracted_file_path):
- with open(extracted_file_path, 'r', encoding='utf-8', errors='ignore') as f:
- result.append({
- 'source': 'file_from_zip',
- 'filename': extracted_file,
- 'content': f.read(),
- 'timestamp': datetime.now().isoformat()
- })
- return result
-
- def _process_single_file(self, file) -> List[Dict]:
- try:
- file_stat = os.stat(file.name)
-
- # For very large files, read in chunks and summarize
- if file_stat.st_size > 100 * 1024 * 1024: # 100MB
- logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
-
- # Read first and last 1MB for extremely large files
- content = ""
- with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
- content = f.read(1 * 1024 * 1024) # First 1MB
- content += "\n...[Content truncated due to large file size]...\n"
-
- # Seek to the last 1MB
- f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
- content += f.read() # Last 1MB
- else:
- # Regular file processing
- with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
- content = f.read()
-
- return [{
- 'source': 'file',
- 'filename': os.path.basename(file.name),
- 'file_size': file_stat.st_size,
- 'mime_type': mimetypes.guess_type(file.name)[0],
- 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
- 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
- 'content': content,
- 'timestamp': datetime.now().isoformat()
- }]
- except Exception as e:
- logger.error(f"File processing error: {e}")
- return []
-
- # Move process_all_inputs outside of the FileProcessor class
- def process_all_inputs(urls, file, text, notes):
- """Process all input types with progress tracking"""
- try:
- processor = URLProcessor()
- file_processor = FileProcessor()
- results = []
-
- # Process URLs
- if urls:
- url_list = re.split(r'[,\n]', urls)
- url_list = [url.strip() for url in url_list if url.strip()]
-
- for url in url_list:
- validation = processor.validate_url(url)
- if validation.get('is_valid'):
- content = processor.fetch_content(url)
- if content:
- results.append({
- 'source': 'url',
- 'url': url,
- 'content': content,
- 'timestamp': datetime.now().isoformat()
- })
-
- # Process files
- if file:
- results.extend(file_processor.process_file(file))
-
- # Process text input
- if text:
- cleaned_text = processor.advanced_text_cleaning(text)
- results.append({
- 'source': 'direct_input',
- 'content': cleaned_text,
- 'timestamp': datetime.now().isoformat()
- })
-
- # Generate output
- if results:
- output_dir = Path('output') / datetime.now().strftime('%Y-%m-%d')
- output_dir.mkdir(parents=True, exist_ok=True)
- output_path = output_dir / f'processed_{int(time.time())}.json'
-
- with open(output_path, 'w', encoding='utf-8') as f:
- json.dump(results, f, ensure_ascii=False, indent=2)
-
- summary = f"Processed {len(results)} items successfully!"
- json_data = json.dumps(results, indent=2) # Prepare JSON for QR code
- return str(output_path), summary, json_data # Return JSON for editor
- else:
- return None, "No valid content to process.", ""
-
- except Exception as e:
- logger.error(f"Processing error: {e}")
- return None, f"Error: {str(e)}", ""
-
- def generate_qr_code(json_data):
- """Generate QR code from JSON data and return the file path."""
- if json_data:
- return generate_qr(json_data)
-
- # Move generate_qr outside of the FileProcessor class as well
- def generate_qr(json_data):
- """Generate QR code from JSON data and return the file path."""
- try:
- # Try first with automatic version selection
- qr = qrcode.QRCode(
- error_correction=qrcode.constants.ERROR_CORRECT_L,
- box_size=10,
- border=4,
- )
- qr.add_data(json_data)
- qr.make(fit=True)
-
- img = qrcode.make_image(fill_color="black", back_color="white")
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
- img.save(temp_file.name)
- return temp_file.name
- except Exception as e:
- # If the data is too large for a QR code
- logger.error(f"QR generation error: {e}")
-
- # Create a simple QR with error message
- qr = qrcode.QRCode(
- version=1,
- error_correction=qrcode.constants.ERROR_CORRECT_L,
- box_size=10,
- border=4,
- )
- qr.add_data("Error: Data too large for QR code")
- qr.make(fit=True)
-
- img = qrcode.make_image(fill_color="black", back_color="white")
- temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
- img.save(temp_file.name)
- return temp_file.name
-
- def create_interface():
- """Create a comprehensive Gradio interface with advanced features"""
- css = """
- .container { max-width: 1200px; margin: auto; }
- .warning { background-color: #fff3cd; color: #856404; }
- .error { background-color: #f8d7da; color: #721c24; }
- """
-
- with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
- gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
-
- with gr.Tab("URL Processing"):
- url_input = gr.Textbox(
- label="Enter URLs (comma or newline separated)",
- lines=5,
- placeholder="https://example1.com\nhttps://example2.com"
- )
-
- with gr.Tab("File Input"):
- file_input = gr.File(
- label="Upload text file or ZIP archive",
- file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
- )
-
- with gr.Tab("Text Input"):
- text_input = gr.Textbox(
- label="Raw Text Input",
- lines=5,
- placeholder="Paste your text here..."
- )
-
- with gr.Tab("JSON Editor"):
- json_editor = gr.Textbox(
- label="JSON Editor",
- lines=20,
- placeholder="View and edit your JSON data here...",
- interactive=True,
- elem_id="json-editor" # Optional: for custom styling
- )
-
- with gr.Tab("Scratchpad"):
- scratchpad = gr.Textbox(
- label="Scratchpad",
- lines=10,
- placeholder="Quick notes or text collections...",
- interactive=True
- )
-
- process_btn = gr.Button("Process Input", variant="primary")
- qr_btn = gr.Button("Generate QR Code", variant="secondary")
-
- output_text = gr.Textbox(label="Processing Results", interactive=False)
- output_file = gr.File(label="Processed Output")
- qr_output = gr.Image(label="QR Code", type="filepath") # To display the generated QR code
-
- process_btn.click(
- process_all_inputs,
- inputs=[url_input, file_input, text_input, scratchpad],
- outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
- )
-
- qr_btn.click(
- generate_qr_code,
- inputs=json_editor,
- outputs=qr_output
- )
-
- gr.Markdown("""
- ### Usage Guidelines
- - **URL Processing**: Enter valid HTTP/HTTPS URLs
- - **File Input**: Upload text files or ZIP archives
- - ** Text Input**: Direct text processing
- - **JSON Editor**: View and edit your JSON data
- - **Scratchpad**: Quick notes or text collections
- - Advanced cleaning and validation included
- """)
- return interface
-
- def check_network_connectivity():
- """Check if the network is working properly by testing connection to common sites"""
- test_sites = ["https://www.google.com", "https://www.cloudflare.com", "https://www.amazon.com"]
- results = []
-
- for site in test_sites:
- try:
- response = requests.get(site, timeout=5)
- results.append({
- "site": site,
- "status": "OK" if response.status_code == 200 else f"Error: {response.status_code}",
- "response_time": response.elapsed.total_seconds()
- })
- except Exception as e:
- results.append({
- "site": site,
- "status": f"Error: {str(e)}",
- "response_time": None
- })
-
- # If all sites failed, there might be a network issue
- if all(result["status"].startswith("Error") for result in results):
- logger.error("Network connectivity issue detected. All test sites failed.")
- return False, results
-
- return True, results
-
- # Add this to the main function
- def main():
- # Configure system settings
- mimetypes.init()
-
- # Check network connectivity
- network_ok, network_results = check_network_connectivity()
- if not network_ok:
- logger.warning("Network connectivity issues detected. Some features may not work properly.")
- for result in network_results:
- logger.warning(f"Test site {result['site']}: {result['status']}")
-
- # Create and launch interface
- interface = create_interface()
-
- # Launch with proper configuration
- interface.launch(
- server_name="0.0.0.0",
- server_port=7860,
- show_error=True,
- share=False,
- inbrowser=True,
- debug=True
- )
-
- if __name__ == "__main__":
- main()
 
  try:
  if not validators.url(url):
  return {'is_valid': False, 'message': 'Invalid URL format'}

  # Try with DNS resolution retry
  for attempt in range(3): # Try up to 3 times
  try:

  response.raise_for_status()
  # Close the connection to avoid downloading the entire content
  response.close()
+
  return {'is_valid': True, 'message': 'URL is valid and accessible'}
  except requests.exceptions.ConnectionError as e:
  if "NameResolutionError" in str(e) or "Failed to resolve" in str(e):
+ logger.warning(f"DNS resolution failed for {url}, attempt {attempt + 1}/3")
  time.sleep(1) # Wait a bit before retrying
  continue
  else:
  raise
  except Exception as e:
  raise

  # If we get here, all attempts failed
+ return {'is_valid': False,
+ 'message': f'URL validation failed: DNS resolution failed after multiple attempts'}
  except Exception as e:
  logger.error(f"URL validation failed for {url}: {str(e)}")
  return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}

  """Universal content fetcher with special case handling"""
  try:
  logger.info(f"Fetching content from: {url}")
+
  # Google Drive document handling
  if 'drive.google.com' in url:
  return self._handle_google_drive(url)

  # Google Calendar ICS handling
  if 'calendar.google.com' in url and 'ical' in url:
  return self._handle_google_calendar(url)

  # Try standard HTML processing first
  result = self._fetch_html_content(url)
+
  # If standard processing failed or returned minimal content, try with Selenium
  if not result or len(result.get('content', '')) < 100:
+ logger.info(
+ f"Standard processing failed or returned minimal content for {url}, trying Selenium")
  selenium_html = self._fetch_with_selenium(url)

  if selenium_html:
  # Process the Selenium HTML
  soup = BeautifulSoup(selenium_html, 'html.parser')

  # Remove unwanted elements
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
  element.decompose()
+
  # Apply the same content extraction strategies as in _fetch_html_content
  # Strategy 1: Look for semantic HTML5 elements
  main_content = None
+ for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post',
+ '.entry', '.page']:
  elements = soup.select(selector)
  if elements:
  main_content = elements[0]
  logger.info(f"Found content with selector: {selector}")
  break
+
  # If no main content found, use body
  if not main_content or not main_content.get_text(strip=True):
  main_content = soup.body if soup.body else soup
+
  # Extract text
  text_content = main_content.get_text(separator='\n', strip=True)
+
  # Clean content
  cleaned_content = self.advanced_text_cleaning(text_content)
+
  if len(cleaned_content) >= 20:
  result = {
  'content': cleaned_content,

  'url': url,
  'source': 'selenium' # Mark that this came from Selenium
  }
+ # Log the result status
  if result:
  logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
  else:
  logger.error(f"Failed to extract content from {url}")

  return result
  except Exception as e:
  logger.error(f"Content fetch failed for {url}: {e}")
  return None

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
  def _fetch_html_content(self, url: str) -> Optional[Dict]:
  """Standard HTML content processing"""
  try:
  # Try with a different user agent if it's a social media site
+ if any(domain in url for domain in
+ ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
  # Use a more realistic browser user agent instead of random one
  self.session.headers.update({
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',

  if 'facebook.com' in url and 'm.facebook.com' not in url:
  url = url.replace('www.facebook.com', 'm.facebook.com')
  logger.info(f"Switched to mobile Facebook URL: {url}")
+
+ # Add a delay to simulate human browsing
+ time.sleep(1)
+
  # Try to get the page with multiple attempts
  max_attempts = 3
  for attempt in range(max_attempts):

  break
  except (requests.exceptions.RequestException, Exception) as e:
  if attempt < max_attempts - 1:
+ logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}. Retrying...")
  time.sleep(2) # Wait longer between retries
  else:
  raise

  logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
+
  # Save the raw HTML for debugging if needed
  debug_path = f"/Users/a2014/urld/debug_raw_{int(time.time())}.html"
  with open(debug_path, "w", encoding="utf-8") as f:
  f.write(response.text)
  logger.info(f"Saved raw HTML to {debug_path}")
+
  # Check if we got a valid response with content
  if not response.text or len(response.text) < 100:
  logger.error(f"Empty or very short response from {url}")
  return None

+ soup = BeautifulSoup(response.text, 'html.parser')
  # Remove unwanted elements
  for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
  element.decompose()
+
  # Simulate "ESC key" by removing login walls and overlays common on social media sites
  login_wall_selectors = [
+ '.login-wall', '.signup-wall', '.overlay', '.modal',
+ '[role="dialog"]', '[aria-modal="true"]', '.login-overlay',
  '.signup-overlay', '.uiLayer', '.fb_overlay', '.ReactModalPortal',
  '[data-testid="login_dialog"]', '[data-testid="signup_dialog"]',
  '.login-signup-modal', '.onboarding-modal', '.signup-wrapper',

  for element in soup.select(selector):
  logger.info(f"Removing login wall element: {selector}")
  element.decompose()
+
  # Enhanced removal for social media sites
  if 'facebook.com' in url:
  # Facebook specific elements - simulating ESC key
  fb_selectors = [
+ '[data-testid="cookie-policy-manage-dialog"]',
+ '[role="banner"]', '[role="complementary"]',
+ '.login_form_container', '.login_form', '#login_form',
+ '.uiLayer', '.pluginConnectButton', '.fbPageBanner',
+ '._5hn6', '._67m7', '.nonLoggedInSignUp',
  '#headerArea', '.uiContextualLayer', '.uiContextualLayerPositioner'
  ]
  for selector in fb_selectors:
  for element in soup.select(selector):
  element.decompose()
+
  # Look for the main content in mobile version
+ main_content = soup.select_one('#m_story_permalink_view') or soup.select_one(
+ '#mobile_injected_video_feed_pagelet')
  if main_content:
  logger.info("Found Facebook mobile main content")
+
  elif 'instagram.com' in url:
  # Instagram specific elements - simulating ESC key
  ig_selectors = [
+ '[role="presentation"]', '[role="banner"]', '[role="complementary"]',
+ '.RnEpo', '._acb3', '._ab8w', '._abn5', '.x1n2onr6',
+ '.x78zum5', '.x1q0g3np', '.xieb3on', '._a9-z', '._a9_1',
  '._aa4b', '.x1i10hfl', '.x9f619', '.xnz67gz', '.x78zum5',
  '.x1q0g3np', '.x1gslohp', '.xieb3on', '.x1lcm9me'
  ]
  for selector in ig_selectors:
  for element in soup.select(selector):
  element.decompose()
+
  # Try to find the main content
+ insta_content = soup.select_one('main article') or soup.select_one('._aagv') or soup.select_one(
+ '._ab1y')
  if insta_content:
  logger.info("Found Instagram main content")
+
  elif 'twitter.com' in url or 'x.com' in url:
  # X/Twitter already works well for public content, but clean up any remaining overlays
  x_selectors = [
  '[data-testid="LoginForm"]', '[data-testid="SignupForm"]',
  '[data-testid="sheetDialog"]', '[data-testid="mask"]',
+ '.r-zchlnj', '.r-1xcajam', '.r-1d2f490', '.r-1p0dtai',
+ '.r-1pi2tsx', '.r-u8s1d', '.css-175oi2r', '.css-1dbjc4n',
  '.r-kemksi', '[data-testid="BottomBar"]'
  ]
  for selector in x_selectors:
  for element in soup.select(selector):
  element.decompose()
+
  elif 'huggingface.co' in url:
  # Special handling for Hugging Face
  logger.info("Applying special handling for Hugging Face")
  # Try to find the main content
+ hf_selectors = ['.prose', '.space-content', '.model-description',
+ '.dataset-description', 'article', '.markdown']
  for selector in hf_selectors:
  elements = soup.select(selector)
  if elements:
  logger.info(f"Found Hugging Face content with selector: {selector}")
  break

  # Extract content using a general approach - try multiple strategies
  # Strategy 1: Look for semantic HTML5 elements
  main_content = None
+ for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry',
+ '.page']:
  elements = soup.select(selector)
  if elements:
  main_content = elements[0]
  logger.info(f"Found content with selector: {selector}")
  break
+
  # Strategy 2: If no semantic elements, try common class names
  if not main_content or not main_content.get_text(strip=True):
  for div in soup.find_all('div'):
  class_name = div.get('class', [])
  id_name = div.get('id', '')
+ if any(term in ' '.join(class_name).lower() for term in
+ ['content', 'main', 'body', 'article', 'post']):
  main_content = div
  logger.info(f"Found content with div class: {class_name}")
  break

  main_content = div
  logger.info(f"Found content with div id: {id_name}")
  break
+
  # Strategy 3: Fall back to body
  if not main_content or not main_content.get_text(strip=True):
  logger.info(f"No main content container found for {url}, using body")
  main_content = soup.body if soup.body else soup
+
  # Extract text with proper spacing
  text_content = main_content.get_text(separator='\n', strip=True)
+
  # Strategy 4: If content is too short, extract all visible text
  if len(text_content) < 100:
  logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")

  if element.get_text(strip=True):
  visible_text.append(element.get_text(strip=True))
  text_content = '\n'.join(visible_text)
+
  # Strategy 5: Last resort - get all text from the page
  if len(text_content) < 50:
  logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
  text_content = soup.get_text(separator='\n', strip=True)

  # Clean and structure content
  cleaned_content = self.advanced_text_cleaning(text_content)
+
  logger.info(f"Final content length: {len(cleaned_content)} chars")
+
  # If we still have no content, this is a failure
  if len(cleaned_content) < 20:
  logger.error(f"Failed to extract meaningful content from {url}")

  logger.error(f"HTML processing failed for {url}: {e}")
  return None

+ def _handle_google_drive(self, url: str) -> Optional[Dict]:
+ """Handle Google Drive document URLs"""
+ try:
+ # Construct direct download URL
+ file_id = url.split("/d/")[1].split("/")[0]
+ download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
+ response = self.session.get(download_url, stream=True, timeout=self.timeout)
+ response.raise_for_status()
+
+ # Read content (limit to the first 1MB)
+ content = b""
+ for chunk in response.iter_content(chunk_size=8192): # 8KB chunks
+ content += chunk
+ if len(content) > 1024 * 1024: # 1MB limit
+ content = content[:1024 * 1024]
+ logger.warning(f"Truncated Google Drive file after 1MB")
+ break
+ text_content = content.decode('utf-8', errors='ignore')
+ cleaned_text = self.advanced_text_cleaning(text_content)
+
+ return {
+ 'content': cleaned_text,
+ 'content_type': 'text/plain', # Assume plain text for simplicity
+ 'timestamp': datetime.now().isoformat(),
+ 'url': url,
+ 'source': 'google_drive'
+ }
+ except Exception as e:
+ logger.error(f"Error handling Google Drive URL {url}: {e}")
+ return None
+
+ def _handle_google_calendar(self, url: str) -> Optional[Dict]:
+ """Handle Google Calendar ICS URLs"""
+ try:
+ response = self.session.get(url, timeout=self.timeout)
+ response.raise_for_status()
+ text_content = response.text
+ cleaned_text = self.advanced_text_cleaning(text_content)
+ return {
+ 'content': cleaned_text,
+ 'content_type': 'text/calendar', # Correct MIME type
+ 'timestamp': datetime.now().isoformat(),
+ 'url': url,
+ 'source': 'google_calendar'
+ }
+ except Exception as e:
+ logger.error(f"Error handling Google Calendar URL {url}: {e}")
+ return None
+
+ def _fetch_with_selenium(self, url: str) -> Optional[str]:
+ """Use Selenium as a fallback for difficult sites"""
+ try:
+ from selenium import webdriver
+ from selenium.webdriver.chrome.options import Options
+ from selenium.webdriver.common.by import By
+ from selenium.webdriver.support.ui import WebDriverWait
+ from selenium.webdriver.support import expected_conditions as EC
+ from selenium.common.exceptions import TimeoutException
+ import time
+
+ logger.info(f"Attempting to fetch {url} with Selenium")
+
+ # Set up Chrome options
+ chrome_options = Options()
+ chrome_options.add_argument("--headless")
+ chrome_options.add_argument("--no-sandbox")
+ chrome_options.add_argument("--disable-dev-shm-usage")
+ chrome_options.add_argument("--disable-gpu")
+ chrome_options.add_argument("--window-size=1920,1080")
+ chrome_options.add_argument(
+ "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+ # Initialize the driver
+ driver = webdriver.Chrome(options=chrome_options)
+
+ try:
+ # Navigate to the URL
+ driver.get(url)
+
+ # Wait for the page to load
+ WebDriverWait(driver, 10).until(
+ EC.presence_of_element_located((By.TAG_NAME, "body"))
+ )
+
+ # Simulate pressing ESC key to dismiss overlays
+ from selenium.webdriver.common.keys import Keys
+ action_chains = webdriver.ActionChains(driver)
+ action_chains.send_keys(Keys.ESCAPE).perform()
+ time.sleep(1) # give it a moment to take effect
+ action_chains.reset_actions() # Clear actions
+
+ # try again
+ action_chains.send_keys(Keys.ESCAPE).perform()
+ time.sleep(1) # give it a moment to take effect
+ action_chains.reset_actions()
+
+ # Get the page source
+ page_source = driver.page_source
+
+ # Save the Selenium HTML for debugging
+ debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
+ with open(debug_path, "w", encoding="utf-8") as f:
+ f.write(page_source)
+ logger.info(f"Saved Selenium HTML to {debug_path}")
+
+ return page_source
+ finally:
+ driver.quit()
+
+ except ImportError:
+ logger.error("Selenium is not installed. Cannot use browser automation.")
+ return None
+ except Exception as e:
+ logger.error(f"Selenium processing failed for {url}: {e}")
+ return None
+

  class FileProcessor:
  """Class to handle file processing"""

  file_size = os.path.getsize(file.name)
  if file_size > self.max_file_size:
  logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
+ ret