acecalisto3 committed on
Commit 5798d9f · verified · 1 Parent(s): e1fd662

Update app2.py

Files changed (1)
  1. app2.py +743 -492
app2.py CHANGED
@@ -1,126 +1,120 @@
 
 
 
 
1
  import json
2
- import os
3
- import re
4
  import logging
5
  import mimetypes
6
- import time
7
  from PIL import Image
8
- import zxing
9
- import io
10
- import zipfile
11
- import tempfile
12
- from datetime import datetime
13
- from typing import List, Dict, Optional, Union, Any
14
- from pathlib import Path
15
  import requests
 
 
16
  import validators
17
- import gradio as gr
 
18
  from bs4 import BeautifulSoup
19
- from fake_useragent import UserAgent
20
  from cleantext import clean
21
- import qrcode
22
- import cv2 # Add this import for the decode_qr_code function
23
- # Setup logging
24
- import sys
25
- import argparse
26
- import base64
27
- import io
28
- logging.basicConfig(
29
- level=logging.INFO,
30
- format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
31
- handlers=[
32
- logging.StreamHandler(),
33
- logging.FileHandler('app.log', encoding='utf-8')
34
- ]
35
- )
36
- logger = logging.getLogger(__name__)
37
-
38
- # Ensure output directories exist
39
- Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
40
 
 
 
 
 
 
 
 
41
  class URLProcessor:
42
- def __init__(self):
43
- self.session = requests.Session()
44
- self.timeout = 10
45
- self.max_retries = 3
46
- self.request_delay = 1.0
47
- self.respect_robots = True
48
- self.use_proxy = False
49
- self.proxy_url = None
50
- self.rate_limits = {} # Track rate limits per domain
51
- self.selenium_driver = None
52
 
53
- # Update session headers with rotating user agents
 
54
  self.update_user_agent()
55
 
56
- if self.use_proxy and self.proxy_url:
57
- self.session.proxies = {
58
- 'http': self.proxy_url,
59
- 'https': self.proxy_url
60
- }
61
 
62
  def update_user_agent(self):
63
- """Rotate user agents to avoid detection"""
64
- try:
65
- self.session.headers.update({
66
- 'User-Agent': UserAgent().random,
67
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
68
- 'Accept-Language': 'en-US,en;q=0.5',
69
- 'Accept-Encoding': 'gzip, deflate, br',
70
- 'Connection': 'keep-alive',
71
- 'Upgrade-Insecure-Requests': '1',
72
- 'Cache-Control': 'max-age=0'
73
- })
74
- except Exception as e:
75
- logger.warning(f"Failed to update user agent: {e}")
76
- # Fallback to a common user agent
77
- self.session.headers.update({
78
- 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
79
- })
80
 
81
  def get_selenium_driver(self):
82
- """Initialize Selenium WebDriver for interactive sites"""
83
- if self.selenium_driver is not None:
84
- return self.selenium_driver
85
 
86
  try:
87
- from selenium import webdriver
88
- from selenium.webdriver.chrome.service import Service
89
  from selenium.webdriver.chrome.options import Options
 
90
  from webdriver_manager.chrome import ChromeDriverManager
91
 
92
  options = Options()
93
- options.add_argument("--headless")
94
- options.add_argument("--no-sandbox")
95
- options.add_argument("--disable-dev-shm-usage")
96
- options.add_argument(f"user-agent={self.session.headers['User-Agent']}")
97
- options.add_argument("--disable-notifications")
98
- options.add_argument("--disable-popup-blocking")
99
- options.add_argument("--disable-extensions")
100
 
101
  service = Service(ChromeDriverManager().install())
102
- self.selenium_driver = webdriver.Chrome(service=service, options=options)
103
- return self.selenium_driver
104
  except Exception as e:
105
  logger.error(f"Failed to initialize Selenium: {e}")
106
  return None
107
 
108
- def handle_rate_limits(self, domain):
109
- """Smart rate limiting based on domain"""
110
- from urllib.parse import urlparse
111
- import time
112
-
113
- # Extract domain from URL
114
- parsed_domain = urlparse(domain).netloc
 
 
 
115
 
116
- # Check if we've accessed this domain recently
117
  current_time = time.time()
118
  if parsed_domain in self.rate_limits:
119
  last_access, count = self.rate_limits[parsed_domain]
120
 
121
- # Different delay strategies for different domains
122
- if "facebook" in parsed_domain or "instagram" in parsed_domain:
123
- min_delay = 5.0 # Longer delay for social media sites
 
124
  elif "gov" in parsed_domain:
125
  min_delay = 2.0 # Be respectful with government sites
126
  else:
@@ -216,59 +210,6 @@ class URLProcessor:
216
  except Exception as e:
217
  logger.warning(f"Error handling Google site: {e}")
218
 
219
- def fetch_content(self, url: str) -> Optional[Dict]:
220
- """Fetch content with smart handling for different sites"""
221
- # Check if URL is allowed by robots.txt
222
- if self.respect_robots and not self.check_robots_txt(url):
223
- logger.warning(f"URL {url} is disallowed by robots.txt")
224
- return None
225
-
226
- # Apply rate limiting
227
- self.handle_rate_limits(url)
228
-
229
- # Rotate user agent occasionally
230
- if random.random() < 0.3: # 30% chance to rotate
231
- self.update_user_agent()
232
-
233
- # Determine if site needs special handling
234
- needs_selenium = any(domain in url.lower() for domain in [
235
- 'facebook.com', 'instagram.com', 'linkedin.com',
236
- 'google.com/search', 'twitter.com', 'x.com'
237
- ])
238
-
239
- for attempt in range(self.max_retries):
240
- try:
241
- if needs_selenium:
242
- return self.handle_interactive_site(url)
243
-
244
- # Try with cloudscraper first for sites with anti-bot measures
245
- if any(domain in url.lower() for domain in ['cloudflare', '.gov']):
246
- import cloudscraper
247
- scraper = cloudscraper.create_scraper(
248
- browser={'browser': 'chrome', 'platform': 'darwin', 'mobile': False}
249
- )
250
- response = scraper.get(url, timeout=self.timeout)
251
- else:
252
- # Standard request for most sites
253
- response = self.session.get(url, timeout=self.timeout)
254
-
255
- response.raise_for_status()
256
-
257
- return {
258
- 'content': response.text,
259
- 'content_type': response.headers.get('Content-Type', ''),
260
- 'url': url,
261
- 'status_code': response.status_code
262
- }
263
- except Exception as e:
264
- logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
265
- if attempt < self.max_retries - 1:
266
- # Exponential backoff
267
- time.sleep(self.request_delay * (2 ** attempt))
268
-
269
- logger.error(f"All attempts failed for {url}")
270
- return None
271
-
272
  def check_robots_txt(self, url: str) -> bool:
273
  """Check if URL is allowed by robots.txt"""
274
  if not self.respect_robots:
@@ -290,65 +231,6 @@ class URLProcessor:
290
  logger.warning(f"Error checking robots.txt: {e}")
291
  return True
292
 
293
- def fetch_content(self, url: str) -> Optional[Dict]:
294
- """Fetch content with built-in rate limiting and robots.txt checking"""
295
- if not self.check_robots_txt(url):
296
- logger.warning(f"URL {url} is disallowed by robots.txt")
297
- return None
298
-
299
- time.sleep(self.request_delay) # Basic rate limiting
300
-
301
- for attempt in range(self.max_retries):
302
- try:
303
- if 'drive.google.com' in url:
304
- return self._handle_google_drive(url)
305
- if 'calendar.google.com' in url:
306
- return self._handle_google_calendar(url)
307
- return self._fetch_html_content(url)
308
- except Exception as e:
309
- logger.error(f"Attempt {attempt + 1} failed: {e}")
310
- if attempt < self.max_retries - 1:
311
- time.sleep(self.request_delay * (attempt + 1))
312
-
313
- return None
314
-
315
- def advanced_text_cleaning(self, text: str) -> str:
316
- """Robust text cleaning with version compatibility"""
317
- try:
318
- cleaned_text = clean(
319
- text,
320
- fix_unicode=True,
321
- to_ascii=True,
322
- lower=True,
323
- no_line_breaks=True,
324
- no_urls=True,
325
- no_emails=True,
326
- no_phone_numbers=True,
327
- no_numbers=False,
328
- no_digits=False,
329
- no_currency_symbols=True,
330
- no_punct=False
331
- ).strip()
332
- return cleaned_text
333
- except Exception as e:
334
- logger.warning(f"Text cleaning error: {e}. Using fallback method.")
335
- text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
336
- text = text.encode('ascii', 'ignore').decode('ascii')
337
- text = re.sub(r'\s+', ' ', text)
338
- return text.strip()
339
-
340
- def validate_url(self, url: str) -> Dict:
341
- """Validate URL format and accessibility"""
342
- try:
343
- if not validators.url(url):
344
- return {'is_valid': False, 'message': 'Invalid URL format'}
345
-
346
- response = self.session.head(url, timeout=self.timeout)
347
- response.raise_for_status()
348
- return {'is_valid': True, 'message': 'URL is valid and accessible'}
349
- except Exception as e:
350
- return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}
351
-
352
  def fetch_content(self, url: str) -> Optional[Dict]:
353
  """Universal content fetcher with special case handling"""
354
  try:
@@ -397,30 +279,90 @@ class URLProcessor:
397
  return None
398
 
399
  def _fetch_html_content(self, url: str) -> Optional[Dict]:
400
- """Standard HTML content processing"""
401
  try:
402
  response = self.session.get(url, timeout=self.timeout)
403
  response.raise_for_status()
404
 
 
 
 
 
405
  soup = BeautifulSoup(response.text, 'html.parser')
406
- for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
407
- element.decompose()
 
 
 
 
 
 
 
 
 
 
 
 
408
 
409
- main_content = soup.find('main') or soup.find('article') or soup.body
 
 
 
 
410
 
411
- if main_content is None:
412
- logger.warning(f"No main content found for URL: {url}")
413
- return {
414
- 'content': '',
415
- 'content_type': response.headers.get('Content-Type', ''),
416
- 'timestamp': datetime.now().isoformat()
417
- }
418
 
419
- text_content = main_content.get_text(separator='\n', strip=True)
420
- cleaned_content = self.advanced_text_cleaning(text_content)
421
 
 
422
  return {
423
- 'content': cleaned_content,
 
 
 
 
 
 
 
 
424
  'content_type': response.headers.get('Content-Type', ''),
425
  'timestamp': datetime.now().isoformat()
426
  }
@@ -428,6 +370,146 @@ class URLProcessor:
428
  logger.error(f"HTML processing failed: {e}")
429
  return None
430
431
  class FileProcessor:
432
  """Class to handle file processing with enhanced capabilities"""
433
 
@@ -679,6 +761,7 @@ class FileProcessor:
679
  if file_stat.st_size > 100 * 1024 * 1024: # 100MB
680
  logger.info(f"Processing large file: {file_path} ({file_stat.st_size} bytes)")
681
 
 
682
  with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
683
  content = f.read(1 * 1024 * 1024) # First 1MB
684
  content += "\n...[Content truncated due to large file size]...\n"
@@ -686,312 +769,480 @@ class FileProcessor:
686
  f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
687
  content += f.read() # Last 1MB
688
  else:
689
- with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
690
  content = f.read()
 
 
 
 
 
 
 
 
 
 
 
691
  else:
692
- # For binary files, just record metadata
693
- content = f"[Binary file: {mime_type or 'unknown type'}]"
694
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
695
  return [{
696
- 'source': 'file',
697
- 'filename': os.path.basename(file.name),
698
- 'file_size': file_stat.st_size,
699
- 'mime_type': mimetypes.guess_type(file.name)[0],
700
- 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
701
- 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
702
- 'content': content,
703
  'timestamp': datetime.now().isoformat()
704
  }]
705
- except Exception as e:
706
- logger.error(f"File processing error: {e}")
707
- return []
708
 
709
- def clean_json(self, data: Union[str, Dict]) -> Optional[Dict]:
710
- """Clean and validate JSON data"""
711
  try:
712
- if isinstance(data, str):
713
- data = data.strip()
714
- data = json.loads(data)
715
-
716
- cleaned = json.loads(json.dumps(data))
717
- return cleaned
718
- except json.JSONDecodeError as e:
719
- logger.error(f"JSON cleaning error: {e}")
720
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
  except Exception as e:
722
- logger.error(f"Unexpected error while cleaning JSON: {e}")
723
- return None
724
-
725
- def generate_qr_code(self, data: Union[str, Dict], combined: bool = True) -> List[str]:
726
- """Generate QR code(s) from data"""
 
 
 
 
 
727
  try:
728
- output_dir = Path('output/qr_codes')
729
- output_dir.mkdir(parents=True, exist_ok=True)
730
-
731
- if combined:
732
- cleaned_data = self.clean_json(data)
733
- if cleaned_data:
734
- qr = qrcode.QRCode(
735
- version=None,
736
- error_correction=qrcode.constants.ERROR_CORRECT_L,
737
- box_size=10,
738
- border=4,
739
- )
740
- json_str = json.dumps(cleaned_data, ensure_ascii=False)
741
- qr.add_data(json_str)
742
- qr.make(fit=True)
743
-
744
- img = qr.make_image(fill_color="black", back_color="white")
745
- output_path = output_dir / f'combined_qr_{int(time.time())}.png'
746
- img.save(str(output_path))
747
- return [str(output_path)]
748
- else:
749
- if isinstance(data, list):
750
- paths = []
751
- for idx, item in enumerate(data):
752
- cleaned_item = self.clean_json(item)
753
- if cleaned_item:
754
- qr = qrcode.QRCode(
755
- version=None,
756
- error_correction=qrcode.constants.ERROR_CORRECT_L,
757
- box_size=10,
758
- border=4,
759
- )
760
- json_str = json.dumps(cleaned_item, ensure_ascii=False)
761
- qr.add_data(json_str)
762
- qr.make(fit=True)
763
-
764
- img = qrcode.make_image(fill_color="black", back_color="white")
765
- output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
766
- img.save(str(output_path))
767
- paths.append(str(output_path))
768
- return paths
769
- else:
770
- cleaned_item = self.clean_json(data)
771
- if cleaned_item:
772
- qr = qrcode.QRCode(
773
- version=None,
774
- error_correction=qrcode.constants.ERROR_CORRECT_L,
775
- box_size=10,
776
- border=4,
777
- )
778
- json_str = json.dumps(cleaned_item, ensure_ascii=False)
779
- qr.add_data(json_str)
780
- qr.make(fit=True)
781
-
782
- img = qrcode.make_image(fill_color="black", back_color="white")
783
- output_path = output_dir / f'single_qr_{int(time.time())}.png'
784
- img.save(str(output_path))
785
- return [str(output_path)]
786
 
787
- return []
788
  except Exception as e:
789
- logger.error(f"QR generation error: {e}")
790
- return []
791
- def decode_qr_code(image_path: str) -> Optional[str]:
792
- """Decode QR code from an image file using OpenCV with improved binary handling"""
793
- try:
794
- # Read image using OpenCV
795
- img = cv2.imread(image_path)
796
- if img is None:
797
- logger.error(f"Failed to read image: {image_path}")
798
- return None
 
 
 
 
 
 
 
799
 
800
- # Convert to grayscale
801
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
802
-
803
- # Initialize QRCode detector
804
- detector = cv2.QRCodeDetector()
805
-
806
- # Detect and decode
807
- data, vertices, _ = detector.detectAndDecode(gray)
808
-
809
- if vertices is not None and data:
810
- # Check if this might be binary data (like a PDF)
811
- if data.startswith("%PDF") or not all(ord(c) < 128 for c in data):
812
- # This is likely binary data, encode as base64
813
- try:
814
- # If it's already a string representation, convert to bytes first
815
- if isinstance(data, str):
816
- data_bytes = data.encode('latin-1') # Use latin-1 to preserve byte values
817
- else:
818
- data_bytes = data
819
-
820
- # Encode as base64
821
- base64_data = base64.b64encode(data_bytes).decode('ascii')
822
- return f"base64:{base64_data}"
823
- except Exception as e:
824
- logger.error(f"Error encoding binary data: {e}")
825
 
826
- return data
827
-
828
- logger.warning("No QR code found in image")
829
- return None
830
- except Exception as e:
831
- logger.error(f"QR decoding error: {e}")
832
- return None
833
-
834
- # Also update the datachat_interface function to handle base64 data
835
- def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: str, query: str) -> str:
836
- """Interface for DataChat functionality with binary data support"""
837
- data = None
838
- if data_source == "JSON Input":
839
- data = json_input
840
- elif data_source == "QR Code":
841
- try:
842
- decoded_data = decode_qr_code(qr_image)
843
 
844
- # Handle base64 encoded data
845
- if decoded_data and decoded_data.startswith("base64:"):
846
- base64_part = decoded_data[7:] # Remove the "base64:" prefix
847
- try:
848
- # For PDFs and other binary data, provide info about the content
849
- binary_data = base64.b64decode(base64_part)
850
- if binary_data.startswith(b"%PDF"):
851
- data = "The QR code contains a PDF document. Binary data cannot be processed directly."
852
- else:
853
- # Try to decode as text as a fallback
854
- data = binary_data.decode('utf-8', errors='replace')
855
- except Exception as e:
856
- logger.error(f"Error processing base64 data: {e}")
857
- data = "The QR code contains binary data that cannot be processed directly."
858
- else:
859
- data = decoded_data
860
 
861
- if not data:
862
- return "No QR code found in the provided image."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
  except Exception as e:
864
- return f"Invalid QR code data provided: {e}"
865
- else:
866
- return "No valid data source selected."
867
-
868
- if mode == "Trained with Data":
869
- return datachat_trained(data, query)
870
- elif mode == "Chat about Data":
871
- return datachat_simple(data, query)
872
- else:
873
- return "Invalid mode selected."
874
-
875
- # Replace the create_interface function with this version
876
- def create_interface():
877
- """Create a comprehensive Gradio interface with advanced features"""
878
- css = """
879
- .container { max-width: 1200px; margin: auto; }
880
- .warning { background-color: #fff3cd; color: #856404; padding: 10px; border-radius: 4px; }
881
- .error { background-color: #f8d7da; color: #721c24; padding: 10px; border-radius: 4px; }
882
- .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
883
- """
884
 
885
- # Use Interface instead of Blocks
886
- demo = gr.Interface(
887
- fn=datachat_interface,
888
- inputs=[
889
- gr.Radio(["Trained with Data", "Chat about Data"], label="Mode"),
890
- gr.Radio(["JSON Input", "QR Code"], label="Data Source"),
891
- gr.Textbox(lines=8, label="JSON Data"),
892
- gr.Image(label="QR Code Image", type="filepath"),
893
- gr.Textbox(label="Query")
894
- ],
895
- outputs=gr.Textbox(label="Response"),
896
- title="Advanced Data Processor & QR Code Generator",
897
- description="# 🌐 Advanced Data Processing & QR Code Generator",
898
- css=css
899
- )
900
 
901
- return interface
902
-
903
- def main():
904
- """Main entry point for the application"""
905
- parser = argparse.ArgumentParser(description='URL and File Processor')
906
- parser.add_argument('--mode', choices=['web', 'cli'], default='web', help='Run mode (web interface or CLI)')
907
- parser.add_argument('--url', help='URL to process (CLI mode)')
908
- parser.add_argument('--file', help='File to process (CLI mode)')
909
- parser.add_argument('--output', help='Output directory for results (CLI mode)')
910
- parser.add_argument('--share', action='store_true', help='Share the web interface publicly (web mode)')
911
- parser.add_argument('--check-deps', action='store_true', help='Check dependencies and install missing ones')
 
 
 
 
912
 
913
- args = parser.parse_args()
914
 
915
- # Check dependencies if requested
916
- if args.check_deps:
917
- from utils import check_dependencies, install_missing_dependencies
918
-
919
- logger.info("Checking dependencies...")
920
- deps = check_dependencies()
921
-
922
- missing = [pkg for pkg, installed in deps.items() if not installed]
923
- if missing:
924
- logger.info(f"Missing dependencies: {', '.join(missing)}")
925
- if input("Install missing dependencies? (y/n): ").lower() == 'y':
926
- install_missing_dependencies(missing)
927
  else:
928
- logger.warning("Some features may not work without required dependencies.")
929
- else:
930
- logger.info("All dependencies are installed.")
 
 
 
 
 
 
 
 
 
 
 
931
 
932
- # Run in web mode
933
- if args.mode == 'web':
934
  try:
935
- import gradio
936
- except ImportError:
937
- logger.error("Gradio is required for web mode. Install with 'pip install gradio'")
938
- sys.exit(1)
939
-
940
- from interface import Interface
941
-
942
- logger.info("Starting web interface...")
943
- interface = Interface()
944
- interface.launch(share=args.share)
945
-
946
- # Run in CLI mode
947
- elif args.mode == 'cli':
948
- if not args.url and not args.file:
949
- logger.error("In CLI mode, you must provide either --url or --file")
950
- sys.exit(1)
951
-
952
- results = []
953
-
954
- # Process URL if provided
955
- if args.url:
956
- from url_processor import URLProcessor
957
-
958
- logger.info(f"Processing URL: {args.url}")
959
- url_processor = URLProcessor()
960
- url_results = url_processor.process_urls([args.url])
961
- results.extend(url_results)
962
-
963
- # Process file if provided
964
- if args.file:
965
- from file_processor import FileProcessor
966
 
967
- if not os.path.exists(args.file):
968
- logger.error(f"File not found: {args.file}")
969
- sys.exit(1)
 
 
970
 
971
- logger.info(f"Processing file: {args.file}")
972
- file_processor = FileProcessor()
973
 
974
- # Create a file-like object with a name attribute
975
- class FileObj:
976
- def __init__(self, path):
977
- self.name = path
978
 
979
- file_results = file_processor.process_file(FileObj(args.file))
980
- results.extend(file_results)
981
-
982
- # Save results
983
- if results:
984
- from utils import save_results
985
 
986
- output_dir = args.output or os.getcwd()
987
- filepath = save_results(results, output_dir)
988
 
989
- if filepath:
990
- logger.info(f"Results saved to: {filepath}")
991
- else:
992
- logger.error("Failed to save results")
993
- else:
994
- logger.warning("No results to save")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
995
 
996
  if __name__ == "__main__":
997
  main()
 
1
+ import base64
2
+ import gradio as gr
3
+ import hashlib
4
+ import io
5
  import json
 
 
6
  import logging
7
  import mimetypes
8
+ import os
9
  from PIL import Image
10
+ import qrcode
11
+ import random
12
+ import re
 
 
 
 
13
  import requests
14
+ import tempfile
15
+ import time
16
  import validators
17
+ import zipfile
18
+ import zxing
19
  from bs4 import BeautifulSoup
 
20
  from cleantext import clean
21
+ from datetime import datetime
22
+ from fake_useragent import UserAgent
23
+ from file_processor import FileProcessor
24
+ from pathlib import Path
25
+ from qr_processor import QRProcessor
26
+ from selenium import webdriver
27
+ from typing import List, Dict, Optional, Union, Any
28
+ from url_processor import URLProcessor
29
+ from urllib.parse import urlparse
30
+ from utils import save_results, extract_urls_from_text, format_results_as_markdown
 
 
 
 
 
 
 
 
 
31
 
32
+ # Configure logging
33
+ import logging
34
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
35
+ logger = logging.getLogger('App')
36
+
37
+ # URLProcessor class
38
+ # ===================
39
  class URLProcessor:
40
+ """Class to handle URL processing with advanced features"""
41
+
42
+ def __init__(self, request_delay: float = 1.0, timeout: int = 30, max_retries: int = 3, respect_robots: bool = True):
43
+ self.request_delay = request_delay
44
+ self.timeout = timeout
45
+ self.max_retries = max_retries
46
+ self.respect_robots = respect_robots
47
+ self.rate_limits = {} # Domain -> (last_access_time, count)
 
 
48
 
49
+ # Initialize session with rotating user agents
50
+ self.session = requests.Session()
51
  self.update_user_agent()
52
 
53
+ # Selenium driver (lazy initialization)
54
+ self._driver = None
 
 
 
55
 
56
  def update_user_agent(self):
57
+ """Rotate user agent to avoid detection"""
58
+ user_agents = [
59
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
60
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
61
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
62
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
63
+ ]
64
+ self.session.headers.update({
65
+ 'User-Agent': random.choice(user_agents),
66
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
67
+ 'Accept-Language': 'en-US,en;q=0.5',
68
+ 'Connection': 'keep-alive',
69
+ 'Upgrade-Insecure-Requests': '1',
70
+ 'Pragma': 'no-cache',
71
+ 'Cache-Control': 'no-cache',
72
+ })
 
73
 
74
  def get_selenium_driver(self):
75
+ """Get or create Selenium WebDriver with proper settings"""
76
+ if self._driver is not None:
77
+ return self._driver
78
 
79
  try:
 
 
80
  from selenium.webdriver.chrome.options import Options
81
+ from selenium.webdriver.chrome.service import Service
82
  from webdriver_manager.chrome import ChromeDriverManager
83
 
84
  options = Options()
85
+ options.add_argument('--headless')
86
+ options.add_argument('--no-sandbox')
87
+ options.add_argument('--disable-dev-shm-usage')
88
+ options.add_argument('--disable-gpu')
89
+ options.add_argument('--window-size=1920,1080')
90
+ options.add_argument(f'user-agent={self.session.headers["User-Agent"]}')
 
91
 
92
  service = Service(ChromeDriverManager().install())
93
+ self._driver = webdriver.Chrome(service=service, options=options)
94
+ return self._driver
95
  except Exception as e:
96
  logger.error(f"Failed to initialize Selenium: {e}")
97
  return None
98
 
99
+ def close(self):
100
+ """Close resources"""
101
+ if self._driver is not None:
102
+ self._driver.quit()
103
+ self._driver = None
104
+
105
+ def handle_rate_limits(self, url: str):
106
+ """Implement rate limiting per domain"""
107
+ parsed_url = urlparse(url)
108
+ parsed_domain = parsed_url.netloc
109
 
 
110
  current_time = time.time()
111
  if parsed_domain in self.rate_limits:
112
  last_access, count = self.rate_limits[parsed_domain]
113
 
114
+ # Determine appropriate delay based on domain
115
+ min_delay = self.request_delay
116
+ if "linkedin.com" in parsed_domain:
117
+ min_delay = 5.0 # LinkedIn is sensitive to scraping
118
  elif "gov" in parsed_domain:
119
  min_delay = 2.0 # Be respectful with government sites
120
  else:
 
210
  except Exception as e:
211
  logger.warning(f"Error handling Google site: {e}")
212
213
  def check_robots_txt(self, url: str) -> bool:
214
  """Check if URL is allowed by robots.txt"""
215
  if not self.respect_robots:
 
231
  logger.warning(f"Error checking robots.txt: {e}")
232
  return True
233
234
  def fetch_content(self, url: str) -> Optional[Dict]:
235
  """Universal content fetcher with special case handling"""
236
  try:
 
279
  return None
280
 
281
  def _fetch_html_content(self, url: str) -> Optional[Dict]:
282
+ """Enhanced HTML content processing to extract everything"""
283
  try:
284
  response = self.session.get(url, timeout=self.timeout)
285
  response.raise_for_status()
286
 
287
+ # Store the original HTML
288
+ original_html = response.text
289
+
290
+ # Parse with BeautifulSoup
291
  soup = BeautifulSoup(response.text, 'html.parser')
292
+
293
+ # Extract all text content
294
+ text_content = soup.get_text(separator='\n', strip=True)
295
+
296
+ # Extract all links
297
+ links = []
298
+ for link in soup.find_all('a', href=True):
299
+ href = link['href']
300
+ # Convert relative URLs to absolute
301
+ if href.startswith('/'):
302
+ from urllib.parse import urlparse, urljoin
303
+ parsed_url = urlparse(url)
304
+ base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
305
+ href = urljoin(base_url, href)
306
 
307
+ link_text = link.get_text(strip=True)
308
+ links.append({
309
+ 'url': href,
310
+ 'text': link_text if link_text else '[No text]'
311
+ })
312
 
313
+ # Extract all images
314
+ images = []
315
+ for img in soup.find_all('img', src=True):
316
+ src = img['src']
317
+ # Convert relative URLs to absolute
318
+ if src.startswith('/'):
319
+ from urllib.parse import urlparse, urljoin
320
+ parsed_url = urlparse(url)
321
+ base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
322
+ src = urljoin(base_url, src)
323
+
324
+ alt_text = img.get('alt', '')
325
+ images.append({
326
+ 'src': src,
327
+ 'alt': alt_text if alt_text else '[No alt text]'
328
+ })
329
+
330
+ # Extract all scripts
331
+ scripts = []
332
+ for script in soup.find_all('script'):
333
+ script_content = script.string
334
+ if script_content:
335
+ scripts.append(script_content)
336
+
337
+ # Extract all styles
338
+ styles = []
339
+ for style in soup.find_all('style'):
340
+ style_content = style.string
341
+ if style_content:
342
+ styles.append(style_content)
343
+
344
+ # Extract metadata
345
+ metadata = {}
346
+ for meta in soup.find_all('meta'):
347
+ if meta.get('name') and meta.get('content'):
348
+ metadata[meta['name']] = meta['content']
349
+ elif meta.get('property') and meta.get('content'):
350
+ metadata[meta['property']] = meta['content']
351
 
352
+ # Extract title
353
+ title = soup.title.string if soup.title else ''
354
 
355
+ # Return comprehensive data
356
  return {
357
+ 'url': url,
358
+ 'title': title,
359
+ 'metadata': metadata,
360
+ 'content': text_content,
361
+ 'html': original_html,
362
+ 'links': links,
363
+ 'images': images,
364
+ 'scripts': scripts,
365
+ 'styles': styles,
366
  'content_type': response.headers.get('Content-Type', ''),
367
  'timestamp': datetime.now().isoformat()
368
  }
 
370
  logger.error(f"HTML processing failed: {e}")
371
  return None
372
 
373
+ def advanced_text_cleaning(self, text: str) -> str:
374
+ """Robust text cleaning with version compatibility"""
375
+ try:
376
+ # Try to use cleantext if available
377
+ import importlib.util
378
+ if importlib.util.find_spec("cleantext") is not None:
379
+ from cleantext import clean
380
+ cleaned_text = clean(
381
+ text,
382
+ fix_unicode=True,
383
+ to_ascii=True,
384
+ lower=True,
385
+ no_line_breaks=True,
386
+ no_urls=True,
387
+ no_emails=True,
388
+ no_phone_numbers=True,
389
+ no_numbers=False,
390
+ no_digits=False,
391
+ no_currency_symbols=True,
392
+ no_punct=False
393
+ ).strip()
394
+ return cleaned_text
395
+ else:
396
+ # Fallback cleaning
397
+ text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
398
+ text = text.encode('ascii', 'ignore').decode('ascii')
399
+ text = re.sub(r'\s+', ' ', text)
400
+ return text.strip()
401
+ except Exception as e:
402
+ logger.warning(f"Text cleaning error: {e}")
403
+ return text.strip() if text else ""
404
+
405
+ def process_urls(self, urls: List[str], mode: str = 'basic') -> List[Dict]:
406
+ """Process a list of URLs with different modes"""
407
+ results = []
408
+
409
+ for url in urls:
410
+ # Validate URL
411
+ if not validators.url(url):
412
+ results.append({
413
+ 'url': url,
414
+ 'error': 'Invalid URL format',
415
+ 'timestamp': datetime.now().isoformat()
416
+ })
417
+ continue
418
+
419
+ # Check robots.txt
420
+ if not self.check_robots_txt(url):
421
+ results.append({
422
+ 'url': url,
423
+ 'error': 'Access disallowed by robots.txt',
424
+ 'timestamp': datetime.now().isoformat()
425
+ })
426
+ continue
427
+
428
+ # Apply rate limiting
429
+ self.handle_rate_limits(url)
430
+
431
+ # Process based on mode
432
+ try:
433
+ if mode == 'basic':
434
+ content = self.fetch_content(url)
435
+ if content:
436
+ results.append(content)
437
+ else:
438
+ results.append({
439
+ 'url': url,
440
+ 'error': 'Failed to fetch content',
441
+ 'timestamp': datetime.now().isoformat()
442
+ })
443
+
444
+ elif mode == 'interactive':
445
+ content = self.handle_interactive_site(url)
446
+ if content:
447
+ results.append(content)
448
+ else:
449
+ # Fallback to basic mode
450
+ content = self.fetch_content(url)
451
+ if content:
452
+ results.append(content)
453
+ else:
454
+ results.append({
455
+ 'url': url,
456
+ 'error': 'Failed to fetch content in interactive mode',
457
+ 'timestamp': datetime.now().isoformat()
458
+ })
459
+
460
+ elif mode == 'deep':
461
+ # Deep mode: get main content and follow some links
462
+ main_content = self.fetch_content(url)
463
+ if not main_content:
464
+ results.append({
465
+ 'url': url,
466
+ 'error': 'Failed to fetch main content',
467
+ 'timestamp': datetime.now().isoformat()
468
+ })
469
+ continue
470
+
471
+ results.append(main_content)
472
+
473
+ # Follow up to 5 links from the main page
474
+ if 'links' in main_content and main_content['links']:
475
+ followed_count = 0
476
+ for link_data in main_content['links'][:10]: # Consider first 10 links
477
+ link_url = link_data['url']
478
+
479
+ # Skip external links and non-http(s) links
480
+ if not link_url.startswith(('http://', 'https://')):
481
+ continue
482
+
483
+ # Skip if not same domain
484
+ main_domain = urlparse(url).netloc
485
+ link_domain = urlparse(link_url).netloc
486
+ if main_domain != link_domain:
487
+ continue
488
+
489
+ # Apply rate limiting
490
+ self.handle_rate_limits(link_url)
491
+
492
+ # Fetch the linked content
493
+ link_content = self.fetch_content(link_url)
494
+ if link_content:
495
+ results.append(link_content)
496
+ followed_count += 1
497
+
498
+ # Limit to 5 followed links
499
+ if followed_count >= 5:
500
+ break
501
+
502
+ except Exception as e:
503
+ logger.error(f"Error processing URL {url}: {e}")
504
+ results.append({
505
+ 'url': url,
506
+ 'error': f"Processing error: {str(e)}",
507
+ 'timestamp': datetime.now().isoformat()
508
+ })
509
+
510
+
511
+ return results
+
+ # FileProcessor class
512
+ # ===================
513
  class FileProcessor:
514
  """Class to handle file processing with enhanced capabilities"""
515
 
 
761
  if file_stat.st_size > 100 * 1024 * 1024: # 100MB
762
  logger.info(f"Processing large file: {file_path} ({file_stat.st_size} bytes)")
763
 
764
+ content = ""
765
  with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
766
  content = f.read(1 * 1024 * 1024) # First 1MB
767
  content += "\n...[Content truncated due to large file size]...\n"
 
769
  f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
770
  content += f.read() # Last 1MB
771
  else:
772
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
773
  content = f.read()
774
+
775
+ return [{
776
+ 'source': 'file',
777
+ 'filename': filename,
778
+ 'file_size': file_stat.st_size,
779
+ 'mime_type': mime_type,
780
+ 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
781
+ 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
782
+ 'content': content,
783
+ 'timestamp': datetime.now().isoformat()
784
+ }]
785
  else:
786
+ # For binary files, extract metadata and try specialized extraction
787
+ if file_path.endswith(('.pdf', '.doc', '.docx')):
788
+ return self._process_document_file(file_path)
789
+ elif file_path.endswith(('.jpg', '.jpeg', '.png', '.gif', '.bmp')):
790
+ return self._process_image_file(file_path)
791
+ elif file_path.endswith(('.mp3', '.wav', '.ogg', '.mp4', '.avi', '.mov')):
792
+ return self._process_media_file(file_path)
793
+ else:
794
+ # Generic binary file handling
795
+ return [{
796
+ 'source': 'binary_file',
797
+ 'filename': filename,
798
+ 'file_size': file_stat.st_size,
799
+ 'mime_type': mime_type,
800
+ 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
801
+ 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
802
+ 'content': f"[Binary file: {mime_type or 'unknown type'}]",
803
+ 'timestamp': datetime.now().isoformat()
804
+ }]
805
+ except Exception as e:
806
+ logger.error(f"File processing error: {e}")
807
  return [{
808
+ 'source': 'error',
809
+ 'filename': os.path.basename(file.name) if file else 'unknown',
810
+ 'error': str(e),
 
 
 
 
811
  'timestamp': datetime.now().isoformat()
812
  }]
 
 
 
813
 
814
+ def _process_pdf_file(self, file_path: str) -> List[Dict]:
815
+ """Extract text from PDF files"""
816
  try:
817
+ # Try to import PyPDF2 module
818
+ import importlib.util
819
+ if importlib.util.find_spec("PyPDF2") is None:
820
+ return [{
821
+ "error": "PDF processing requires the 'PyPDF2' module. Install with 'pip install PyPDF2'."
822
+ }]
823
+
824
+ import PyPDF2
825
+
826
+ with open(file_path, 'rb') as file:
827
+ reader = PyPDF2.PdfReader(file)
828
+ num_pages = len(reader.pages)
829
+
830
+ # Extract text from each page
831
+ all_text = ""
832
+ page_texts = []
833
+
834
+ for i in range(num_pages):
835
+ page = reader.pages[i]
836
+ text = page.extract_text()
837
+ all_text += text + "\n\n"
838
+ page_texts.append({
839
+ "page_number": i + 1,
840
+ "content": text
841
+ })
842
+
843
+ # Get file metadata
844
+ file_stat = os.stat(file_path)
845
+
846
+ return [{
847
+ "source": "pdf",
848
+ "filename": os.path.basename(file_path),
849
+ "file_size": file_stat.st_size,
850
+ "mime_type": "application/pdf",
851
+ "created": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
852
+ "modified": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
853
+ "num_pages": num_pages,
854
+ "content": all_text,
855
+ "pages": page_texts,
856
+ "timestamp": datetime.now().isoformat()
857
+ }]
858
  except Exception as e:
859
+ logger.error(f"Error processing PDF file: {str(e)}")
860
+ return [{
861
+ "source": "error",
862
+ "filename": os.path.basename(file_path),
863
+ "error": f"Error processing PDF file: {str(e)}",
864
+ "timestamp": datetime.now().isoformat()
865
+ }]
866
+
867
+ def _process_image_file(self, file_path: str) -> List[Dict]:
868
+ """Extract metadata and attempt OCR on image files"""
869
  try:
870
+ # Try to import PIL module
871
+ import importlib.util
872
+ if importlib.util.find_spec("PIL") is None:
873
+ return [{
874
+ "error": "Image processing requires the 'Pillow' module. Install with 'pip install Pillow'."
875
+ }]
876
+
877
+ from PIL import Image
878
 
879
+ # Open image and get basic metadata
880
+ with Image.open(file_path) as img:
881
+ width, height = img.size
882
+ format_name = img.format
883
+ mode = img.mode
884
+
885
+ # Extract EXIF data if available
886
+ exif_data = {}
887
+ if hasattr(img, '_getexif') and img._getexif():
888
+ exif = img._getexif()
889
+ if exif:
890
+ for tag_id, value in exif.items():
891
+ tag_name = f"tag_{tag_id}"
892
+ exif_data[tag_name] = str(value)
893
+
894
+ # Try OCR if pytesseract is available
895
+ ocr_text = None
896
+ if importlib.util.find_spec("pytesseract") is not None:
897
+ try:
898
+ import pytesseract
899
+ ocr_text = pytesseract.image_to_string(img)
900
+ except Exception as e:
901
+ logger.warning(f"OCR failed: {e}")
902
+
903
+ # Get file metadata
904
+ file_stat = os.stat(file_path)
905
+
906
+ return [{
907
+ "source": "image",
908
+ "filename": os.path.basename(file_path),
909
+ "file_size": file_stat.st_size,
910
+ "mime_type": f"image/{format_name.lower()}" if format_name else "image/unknown",
911
+ "created": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
912
+ "modified": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
913
+ "width": width,
914
+ "height": height,
915
+ "format": format_name,
916
+ "mode": mode,
917
+ "exif": exif_data,
918
+ "ocr_text": ocr_text,
919
+ "content": ocr_text if ocr_text else f"[Image: {width}x{height} {format_name}]",
920
+ "timestamp": datetime.now().isoformat()
921
+ }]
922
  except Exception as e:
923
+ logger.error(f"Error processing image file: {str(e)}")
924
+ return [{
925
+ "source": "error",
926
+ "filename": os.path.basename(file_path),
927
+ "error": f"Error processing image file: {str(e)}",
928
+ "timestamp": datetime.now().isoformat()
929
+ }]
930
+
931
+ def _process_media_file(self, file_path: str) -> List[Dict]:
932
+ """Extract metadata from audio/video files"""
933
+ try:
934
+ # Try to import mutagen module
935
+ import importlib.util
936
+ if importlib.util.find_spec("mutagen") is None:
937
+ return [{
938
+ "error": "Media processing requires the 'mutagen' module. Install with 'pip install mutagen'."
939
+ }]
940
 
941
+ import mutagen
942
 
943
+ # Get file metadata
944
+ file_stat = os.stat(file_path)
945
+ mime_type, _ = mimetypes.guess_type(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
946
 
947
+ # Extract media metadata
948
+ media_info = mutagen.File(file_path)
949
+
950
+ metadata = {}
951
+ if media_info:
952
+ # Extract common metadata
953
+ if hasattr(media_info, 'info') and hasattr(media_info.info, 'length'):
954
+ metadata['duration'] = media_info.info.length
 
 
 
 
 
 
 
 
955
 
956
+ # Extract tags
957
+ for key, value in media_info.items():
958
+ if isinstance(value, list) and len(value) == 1:
959
+ metadata[key] = str(value[0])
960
+ else:
961
+ metadata[key] = str(value)
962
+
963
+ return [{
964
+ "source": "media",
965
+ "filename": os.path.basename(file_path),
966
+ "file_size": file_stat.st_size,
967
+ "mime_type": mime_type,
968
+ "created": datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
969
+ "modified": datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
970
+ "metadata": metadata,
971
+ "content": f"[Media file: {mime_type or 'unknown type'}]",
972
+ "timestamp": datetime.now().isoformat()
973
+ }]
974
  except Exception as e:
975
+ logger.error(f"Error processing media file: {str(e)}")
976
+ return [{
977
+ "source": "error",
978
+ "filename": os.path.basename(file_path),
979
+ "error": f"Error processing media file: {str(e)}",
980
+ "timestamp": datetime.now().isoformat()
981
+
982
+ }]
+
+ # QRProcessor class
983
+ # =================
984
+ class QRProcessor:
985
+ """Class to handle QR code processing"""
 
 
 
 
 
 
 
 
 
986
 
987
+ def __init__(self):
988
+ # Check for required libraries
989
+ self._check_dependencies()
 
 
 
 
 
 
 
 
 
 
 
 
990
 
991
+ def _check_dependencies(self):
992
+ """Check if required libraries are installed"""
993
+ try:
994
+ import importlib.util
995
+
996
+ # Check for pyzbar
997
+ if importlib.util.find_spec("pyzbar") is None:
998
+ logger.warning("pyzbar library not found. QR code detection will not work. Install with 'pip install pyzbar'")
999
+
1000
+ # Check for qrcode
1001
+ if importlib.util.find_spec("qrcode") is None:
1002
+ logger.warning("qrcode library not found. QR code generation will not work. Install with 'pip install qrcode'")
1003
+
1004
+ except ImportError as e:
1005
+ logger.error(f"Error checking dependencies: {e}")
1006
 
1007
+ def detect_qr_codes(self, image_path: str) -> List[Dict]:
1008
+ """Detect QR codes in an image"""
1009
+ try:
1010
+ import importlib.util
1011
+ if importlib.util.find_spec("pyzbar") is None:
1012
+ return [{"error": "pyzbar library not found. Install with 'pip install pyzbar'"}]
1013
+
1014
+ from pyzbar.pyzbar import decode
1015
+ from PIL import Image
1016
+
1017
+ # Open the image
1018
+ image = Image.open(image_path)
1019
+
1020
+ # Decode QR codes
1021
+ decoded_objects = decode(image)
1022
+
1023
+ results = []
1024
+ for obj in decoded_objects:
1025
+ # Get the bounding box
1026
+ rect = obj.rect
1027
+ bbox = {
1028
+ 'left': rect.left,
1029
+ 'top': rect.top,
1030
+ 'width': rect.width,
1031
+ 'height': rect.height
1032
+ }
1033
+
1034
+ # Get the data
1035
+ data = obj.data.decode('utf-8', errors='replace')
1036
+
1037
+ # Get the type
1038
+ qr_type = obj.type
1039
+
1040
+ results.append({
1041
+ 'type': qr_type,
1042
+ 'data': data,
1043
+ 'bbox': bbox,
1044
+ 'timestamp': datetime.now().isoformat()
1045
+ })
1046
+
1047
+ if not results:
1048
+ results.append({
1049
+ 'warning': 'No QR codes detected in the image',
1050
+ 'timestamp': datetime.now().isoformat()
1051
+ })
1052
+
1053
+ return results
1054
+
1055
+ except Exception as e:
1056
+ logger.error(f"Error detecting QR codes: {e}")
1057
+ return [{"error": f"Error detecting QR codes: {str(e)}"}]
1058
 
1059
+ def generate_qr_code(self, data: str, output_path: Optional[str] = None, size: int = 10) -> Dict:
1060
+ """Generate a QR code from data"""
1061
+ try:
1062
+ import importlib.util
1063
+ if importlib.util.find_spec("qrcode") is None:
1064
+ return {"error": "qrcode library not found. Install with 'pip install qrcode'"}
1065
+
1066
+ import qrcode
1067
+
1068
+ # Create QR code instance
1069
+ qr = qrcode.QRCode(
1070
+ version=1,
1071
+ error_correction=qrcode.constants.ERROR_CORRECT_L,
1072
+ box_size=size,
1073
+ border=4,
1074
+ )
1075
+
1076
+ # Add data
1077
+ qr.add_data(data)
1078
+ qr.make(fit=True)
1079
+
1080
+ # Create an image from the QR Code instance
1081
+ img = qr.make_image(fill_color="black", back_color="white")
1082
+
1083
+ # Save the image if output path is provided
1084
+ if output_path:
1085
+ img.save(output_path)
1086
+ return {
1087
+ 'success': True,
1088
+ 'data': data,
1089
+ 'output_path': output_path,
1090
+ 'timestamp': datetime.now().isoformat()
1091
+ }
1092
  else:
1093
+ # Save to a temporary file
1094
+ with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
1095
+ temp_path = tmp.name
1096
+ img.save(temp_path)
1097
+ return {
1098
+ 'success': True,
1099
+ 'data': data,
1100
+ 'output_path': temp_path,
1101
+ 'timestamp': datetime.now().isoformat()
1102
+ }
1103
+
1104
+ except Exception as e:
1105
+ logger.error(f"Error generating QR code: {e}")
1106
+ return {"error": f"Error generating QR code: {str(e)}"}
1107
 
1108
+ def extract_qr_from_url(self, url_processor, url: str) -> List[Dict]:
1109
+ """Extract QR codes from an image URL"""
1110
  try:
1111
+ # Fetch the image from the URL
1112
+ response = url_processor.session.get(url, stream=True)
1113
+ response.raise_for_status()
1114
 
1115
+ # Save to a temporary file
1116
+ with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
1117
+ temp_path = tmp.name
1118
+ for chunk in response.iter_content(chunk_size=128):
1119
+ tmp.write(chunk)
1120
 
1121
+ # Process the image
1122
+ results = self.detect_qr_codes(temp_path)
1123
 
1124
+ # Add source information
1125
+ for result in results:
1126
+ result['source_url'] = url
 
1127
 
1128
+ # Clean up
1129
+ os.unlink(temp_path)
 
 
 
 
1130
 
1131
+ return results
 
1132
 
1133
+ except Exception as e:
1134
+ logger.error(f"Error extracting QR from URL: {e}")
1135
+ return [{"error": f"Error extracting QR from URL: {str(e)}"}]
1136
+
1137
+ def batch_process_images(self, image_paths: List[str]) -> Dict[str, List[Dict]]:
1138
+ """Process multiple images for QR codes"""
1139
+ results = {}
1140
+
1141
+ for image_path in image_paths:
1142
+ try:
1143
+ if os.path.exists(image_path):
1144
+ image_results = self.detect_qr_codes(image_path)
1145
+ results[image_path] = image_results
1146
+ else:
1147
+ results[image_path] = [{"error": f"Image file not found: {image_path}"}]
1148
+ except Exception as e:
1149
+ logger.error(f"Error processing image {image_path}: {e}")
1150
+ results[image_path] = [{"error": f"Processing error: {str(e)}"}]
1151
+
1152
+ def create_interface():
1153
+ """Create a comprehensive Gradio interface with advanced features"""
1154
+ css = """
1155
+ .container { max-width: 1200px; margin: auto; }
1156
+ .warning { background-color: #fff3cd; color: #856404; }
1157
+ .error { background-color: #f8d7da; color: #721c24; }
1158
+ """
1159
+
1160
+ with gr.Blocks(css=css, title="Advanced Text & URL Processor") as interface:
1161
+ gr.Markdown("# 🌐 Advanced URL & Text Processing Toolkit")
1162
+
1163
+ with gr.Tab("URL Processing"):
1164
+ url_input = gr.Textbox(
1165
+ label="Enter URLs (comma or newline separated)",
1166
+ lines=5,
1167
+ placeholder="https://example1.com\nhttps://example2.com"
1168
+ )
1169
+
1170
+ with gr.Tab("File Input"):
1171
+ file_input = gr.File(
1172
+ label="Upload text file or ZIP archive",
1173
+ file_types=[".txt", ".zip", ".md", ".csv", ".json", ".xml"]
1174
+ )
1175
+
1176
+ with gr.Tab("Text Input"):
1177
+ text_input = gr.Textbox(
1178
+ label="Raw Text Input",
1179
+ lines=5,
1180
+ placeholder="Paste your text here..."
1181
+ )
1182
+
1183
+ with gr.Tab("JSON Editor"):
1184
+ json_editor = gr.Textbox(
1185
+ label="JSON Editor",
1186
+ lines=20,
1187
+ placeholder="View and edit your JSON data here...",
1188
+ interactive=True,
1189
+ elem_id="json-editor" # Optional: for custom styling
1190
+ )
1191
+
1192
+ with gr.Tab("Scratchpad"):
1193
+ scratchpad = gr.Textbox(
1194
+ label="Scratchpad",
1195
+ lines=10,
1196
+ placeholder="Quick notes or text collections...",
1197
+ interactive=True
1198
+ )
1199
+
1200
+ process_btn = gr.Button("Process Input", variant="primary")
1201
+ qr_btn = gr.Button("Generate QR Code", variant="secondary")
1202
+
1203
+ output_text = gr.Textbox(label="Processing Results", interactive=False)
1204
+ output_file = gr.File(label="Processed Output")
1205
+ qr_output = gr.Image(label="QR Code", type="filepath") # To display the generated QR code
1206
+
1207
+ process_btn.click(
1208
+ process_all_inputs,
1209
+ inputs=[url_input, file_input, text_input, scratchpad],
1210
+ outputs=[output_file, output_text, json_editor] # Update outputs to include JSON editor
1211
+ )
1212
+
1213
+ qr_btn.click(
1214
+ generate_qr_code,
1215
+ inputs=json_editor,
1216
+ outputs=qr_output
1217
+ )
1218
+
1219
+ gr.Markdown("""
1220
+ ### Usage Guidelines
1221
+ - **URL Processing**: Enter valid HTTP/HTTPS URLs
1222
+ - **File Input**: Upload text files or ZIP archives
1223
+ - **Text Input**: Direct text processing
1224
+ - **JSON Editor**: View and edit your JSON data
1225
+ - **Scratchpad**: Quick notes or text collections
1226
+ - Advanced cleaning and validation included
1227
+ """)
1228
+ return interface
1229
+
1230
+ def main():
1231
+ # Configure system settings
1232
+ mimetypes.init()
1233
+
1234
+ # Create and launch interface
1235
+ interface = create_interface()
1236
+
1237
+ # Launch with proper configuration
1238
+ interface.launch(
1239
+ server_name="0.0.0.0",
1240
+ server_port=7860,
1241
+ show_error=True,
1242
+ share=False,
1243
+ inbrowser=True,
1244
+ debug=True
1245
+ )
1246
 
1247
  if __name__ == "__main__":
1248
  main()
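
For reference, a minimal driver sketch (not part of the commit) showing how the classes added in this revision might be exercised outside the Gradio interface. It assumes app2.py is importable from the working directory, that the optional qrcode and pyzbar dependencies the code checks for are installed, and that process_urls returns its results list; the constructor and method signatures follow the diff above.

```python
# Hypothetical usage sketch; URLProcessor and QRProcessor are the classes
# defined in this commit, everything else here is illustrative.
import json

from app2 import URLProcessor, QRProcessor

url_processor = URLProcessor(request_delay=1.0, timeout=30, max_retries=3, respect_robots=True)
try:
    # 'basic' fetches each page once; 'deep' also follows up to 5 same-domain links.
    results = url_processor.process_urls(["https://example.com"], mode="basic")
finally:
    url_processor.close()  # releases the lazily created Selenium driver, if any

qr_processor = QRProcessor()
# Encode the scrape results as JSON in a QR code, then decode it back with pyzbar.
qr_info = qr_processor.generate_qr_code(json.dumps(results), output_path="results_qr.png")
if qr_info.get("success"):
    print(qr_processor.detect_qr_codes(qr_info["output_path"]))
```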