acecalisto3 committed on
Commit
a2b3dd2
·
verified ·
1 Parent(s): 41b4ac1

Update app2.py

Browse files
Files changed (1) hide show
  1. app2.py +276 -63
app2.py CHANGED
@@ -19,6 +19,8 @@ from bs4 import BeautifulSoup
19
  from fake_useragent import UserAgent
20
  from cleantext import clean
21
  import qrcode# Setup logging
 
 
22
  logging.basicConfig(
23
  level=logging.INFO,
24
  format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
@@ -41,22 +43,227 @@ class URLProcessor:
41
  self.respect_robots = True
42
  self.use_proxy = False
43
  self.proxy_url = None
 
 
44
 
45
- # Update session headers
46
- self.session.headers.update({
47
- 'User-Agent': UserAgent().random,
48
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
49
- 'Accept-Language': 'en-US,en;q=0.5',
50
- 'Accept-Encoding': 'gzip, deflate, br',
51
- 'Connection': 'keep-alive',
52
- 'Upgrade-Insecure-Requests': '1'
53
- })
54
 
55
  if self.use_proxy and self.proxy_url:
56
  self.session.proxies = {
57
  'http': self.proxy_url,
58
  'https': self.proxy_url
59
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  def check_robots_txt(self, url: str) -> bool:
62
  """Check if URL is allowed by robots.txt"""
@@ -368,7 +575,7 @@ class FileProcessor:
368
  qr.add_data(json_str)
369
  qr.make(fit=True)
370
 
371
- img = qr.make_image(fill_color="black", back_color="white")
372
  output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
373
  img.save(str(output_path))
374
  paths.append(str(output_path))
@@ -395,70 +602,76 @@ class FileProcessor:
395
  except Exception as e:
396
  logger.error(f"QR generation error: {e}")
397
  return []
398
-
399
- def decode_qr_code(image_path: str) -> Optional[str]:
400
- """Decode QR code from an image file using ZXing"""
401
- try:
402
- reader = zxing.BarCodeReader()
403
- result = reader.decode(image_path)
404
-
405
- if result and result.parsed:
406
- return result.parsed
407
- logger.warning("No QR code found in image")
408
- return None
409
- except Exception as e:
410
- logger.error(f"QR decoding error: {e}")
411
  return None
412
-
413
- def decode_qr(image) -> List[str]:
414
- """Decode all QR codes found in an image using ZXing"""
415
- try:
416
- if isinstance(image, str):
417
- image_path = image
418
- else:
419
- # Save temporary image if input is not a path
420
- with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
421
- Image.fromarray(image).save(tmp.name)
422
- image_path = tmp.name
423
-
424
- reader = zxing.BarCodeReader()
425
- result = reader.decode(image_path)
426
 
427
- if result and result.parsed:
428
- return [result.parsed]
429
- return []
430
- except Exception as e:
431
- logger.error(f"QR decoding error: {e}")
432
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
- raise ValueError("Unable to decode QR code")
435
- except Exception as e:
436
- logger.error(f"QR decoding error: {e}")
437
- return None, None # Return None for both data and resolution in case of error
438
-
439
- def datachat_trained(data_input: str, query: str) -> str:
440
- """Handle trained data interaction logic"""
441
- data = clean_json(data_input)
442
- if not data:
443
- return "Invalid JSON data provided."
444
- return f"[Trained Mode]\nData: {json.dumps(data, indent=2)}\nQuery: {query}"
445
-
446
- def datachat_simple(data_input: str, query: str) -> str:
447
- """Handle simple chat interaction logic"""
448
- data = clean_json(data_input)
449
- if not data:
450
- return "Invalid JSON data provided."
451
- return f"[Chat Mode]\nData: {json.dumps(data, indent=2)}\nQuestion: {query}"
452
 
 
453
  def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: str, query: str) -> str:
454
- """Interface for DataChat functionality"""
455
  data = None
456
  if data_source == "JSON Input":
457
  data = json_input
458
  elif data_source == "QR Code":
459
  try:
460
  decoded_data = decode_qr_code(qr_image)
461
- data = decoded_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
  if not data:
463
  return "No QR code found in the provided image."
464
  except Exception as e:
 
19
  from fake_useragent import UserAgent
20
  from cleantext import clean
21
  import qrcode# Setup logging
22
+ import base64
23
+ import io
24
  logging.basicConfig(
25
  level=logging.INFO,
26
  format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
 
43
  self.respect_robots = True
44
  self.use_proxy = False
45
  self.proxy_url = None
46
+ self.rate_limits = {} # Track rate limits per domain
47
+ self.selenium_driver = None
48
 
49
+ # Update session headers with rotating user agents
50
+ self.update_user_agent()
 
 
 
 
 
 
 
51
 
52
  if self.use_proxy and self.proxy_url:
53
  self.session.proxies = {
54
  'http': self.proxy_url,
55
  'https': self.proxy_url
56
  }
57
+
58
+ def update_user_agent(self):
59
+ """Rotate user agents to avoid detection"""
60
+ try:
61
+ self.session.headers.update({
62
+ 'User-Agent': UserAgent().random,
63
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
64
+ 'Accept-Language': 'en-US,en;q=0.5',
65
+ 'Accept-Encoding': 'gzip, deflate, br',
66
+ 'Connection': 'keep-alive',
67
+ 'Upgrade-Insecure-Requests': '1',
68
+ 'Cache-Control': 'max-age=0'
69
+ })
70
+ except Exception as e:
71
+ logger.warning(f"Failed to update user agent: {e}")
72
+ # Fallback to a common user agent
73
+ self.session.headers.update({
74
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
75
+ })
76
+
77
+ def get_selenium_driver(self):
78
+ """Initialize Selenium WebDriver for interactive sites"""
79
+ if self.selenium_driver is not None:
80
+ return self.selenium_driver
81
+
82
+ try:
83
+ from selenium import webdriver
84
+ from selenium.webdriver.chrome.service import Service
85
+ from selenium.webdriver.chrome.options import Options
86
+ from webdriver_manager.chrome import ChromeDriverManager
87
+
88
+ options = Options()
89
+ options.add_argument("--headless")
90
+ options.add_argument("--no-sandbox")
91
+ options.add_argument("--disable-dev-shm-usage")
92
+ options.add_argument(f"user-agent={self.session.headers['User-Agent']}")
93
+ options.add_argument("--disable-notifications")
94
+ options.add_argument("--disable-popup-blocking")
95
+ options.add_argument("--disable-extensions")
96
+
97
+ service = Service(ChromeDriverManager().install())
98
+ self.selenium_driver = webdriver.Chrome(service=service, options=options)
99
+ return self.selenium_driver
100
+ except Exception as e:
101
+ logger.error(f"Failed to initialize Selenium: {e}")
102
+ return None
103
+
104
+ def handle_rate_limits(self, domain):
105
+ """Smart rate limiting based on domain"""
106
+ from urllib.parse import urlparse
107
+ import time
108
+
109
+ # Extract domain from URL
110
+ parsed_domain = urlparse(domain).netloc
111
+
112
+ # Check if we've accessed this domain recently
113
+ current_time = time.time()
114
+ if parsed_domain in self.rate_limits:
115
+ last_access, count = self.rate_limits[parsed_domain]
116
+
117
+ # Different delay strategies for different domains
118
+ if "facebook" in parsed_domain or "instagram" in parsed_domain:
119
+ min_delay = 5.0 # Longer delay for social media sites
120
+ elif "gov" in parsed_domain:
121
+ min_delay = 2.0 # Be respectful with government sites
122
+ else:
123
+ min_delay = self.request_delay
124
+
125
+ # Exponential backoff if we're making many requests
126
+ if count > 10:
127
+ min_delay *= 2
128
+
129
+ # Wait if needed
130
+ elapsed = current_time - last_access
131
+ if elapsed < min_delay:
132
+ time.sleep(min_delay - elapsed)
133
+
134
+ # Update count
135
+ self.rate_limits[parsed_domain] = (time.time(), count + 1)
136
+ else:
137
+ # First time accessing this domain
138
+ self.rate_limits[parsed_domain] = (current_time, 1)
139
+
140
+ def handle_interactive_site(self, url):
141
+ """Handle sites that require interaction to bypass blocks"""
142
+ driver = self.get_selenium_driver()
143
+ if not driver:
144
+ return None
145
+
146
+ try:
147
+ driver.get(url)
148
+
149
+ # Wait for page to load
150
+ import time
151
+ time.sleep(3)
152
+
153
+ # Handle different types of sites
154
+ if "facebook.com" in url or "instagram.com" in url:
155
+ self._handle_social_media_site(driver)
156
+ elif "google.com" in url:
157
+ self._handle_google_site(driver)
158
+
159
+ # Get the page source after interaction
160
+ page_source = driver.page_source
161
+
162
+ return {
163
+ 'content': page_source,
164
+ 'content_type': 'text/html',
165
+ 'url': url,
166
+ 'title': driver.title
167
+ }
168
+ except Exception as e:
169
+ logger.error(f"Error handling interactive site {url}: {e}")
170
+ return None
171
+
172
+ def _handle_social_media_site(self, driver):
173
+ """Handle Facebook/Instagram login walls"""
174
+ from selenium.webdriver.common.by import By
175
+ from selenium.webdriver.common.keys import Keys
176
+ from selenium.webdriver.support.ui import WebDriverWait
177
+ from selenium.webdriver.support import expected_conditions as EC
178
+
179
+ try:
180
+ # Try to find and close login popups
181
+ close_buttons = driver.find_elements(By.XPATH, "//button[contains(@aria-label, 'Close')]")
182
+ if close_buttons:
183
+ close_buttons[0].click()
184
+ time.sleep(1)
185
+
186
+ # Press ESC key to dismiss popups
187
+ webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
188
+ time.sleep(1)
189
+
190
+ # Scroll down to load more content
191
+ driver.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")
192
+ time.sleep(2)
193
+ except Exception as e:
194
+ logger.warning(f"Error handling social media site: {e}")
195
+
196
+ def _handle_google_site(self, driver):
197
+ """Handle Google authentication and consent pages"""
198
+ from selenium.webdriver.common.by import By
199
+
200
+ try:
201
+ # Look for consent buttons
202
+ consent_buttons = driver.find_elements(By.XPATH, "//button[contains(text(), 'Accept all')]")
203
+ if consent_buttons:
204
+ consent_buttons[0].click()
205
+ time.sleep(1)
206
+
207
+ # Look for "I agree" buttons
208
+ agree_buttons = driver.find_elements(By.XPATH, "//button[contains(text(), 'I agree')]")
209
+ if agree_buttons:
210
+ agree_buttons[0].click()
211
+ time.sleep(1)
212
+ except Exception as e:
213
+ logger.warning(f"Error handling Google site: {e}")
214
+
215
+ def fetch_content(self, url: str) -> Optional[Dict]:
216
+ """Fetch content with smart handling for different sites"""
217
+ # Check if URL is allowed by robots.txt
218
+ if self.respect_robots and not self.check_robots_txt(url):
219
+ logger.warning(f"URL {url} is disallowed by robots.txt")
220
+ return None
221
+
222
+ # Apply rate limiting
223
+ self.handle_rate_limits(url)
224
+
225
+ # Rotate user agent occasionally
226
+ if random.random() < 0.3: # 30% chance to rotate
227
+ self.update_user_agent()
228
+
229
+ # Determine if site needs special handling
230
+ needs_selenium = any(domain in url.lower() for domain in [
231
+ 'facebook.com', 'instagram.com', 'linkedin.com',
232
+ 'google.com/search', 'twitter.com', 'x.com'
233
+ ])
234
+
235
+ for attempt in range(self.max_retries):
236
+ try:
237
+ if needs_selenium:
238
+ return self.handle_interactive_site(url)
239
+
240
+ # Try with cloudscraper first for sites with anti-bot measures
241
+ if any(domain in url.lower() for domain in ['cloudflare', '.gov']):
242
+ import cloudscraper
243
+ scraper = cloudscraper.create_scraper(
244
+ browser={'browser': 'chrome', 'platform': 'darwin', 'mobile': False}
245
+ )
246
+ response = scraper.get(url, timeout=self.timeout)
247
+ else:
248
+ # Standard request for most sites
249
+ response = self.session.get(url, timeout=self.timeout)
250
+
251
+ response.raise_for_status()
252
+
253
+ return {
254
+ 'content': response.text,
255
+ 'content_type': response.headers.get('Content-Type', ''),
256
+ 'url': url,
257
+ 'status_code': response.status_code
258
+ }
259
+ except Exception as e:
260
+ logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
261
+ if attempt < self.max_retries - 1:
262
+ # Exponential backoff
263
+ time.sleep(self.request_delay * (2 ** attempt))
264
+
265
+ logger.error(f"All attempts failed for {url}")
266
+ return None
267
 
268
  def check_robots_txt(self, url: str) -> bool:
269
  """Check if URL is allowed by robots.txt"""
 
575
  qr.add_data(json_str)
576
  qr.make(fit=True)
577
 
578
+ img = qrcode.make_image(fill_color="black", back_color="white")
579
  output_path = output_dir / f'item_{idx}_qr_{int(time.time())}.png'
580
  img.save(str(output_path))
581
  paths.append(str(output_path))
 
602
  except Exception as e:
603
  logger.error(f"QR generation error: {e}")
604
  return []
605
def decode_qr_code(image_path: str) -> Optional[str]:
    """Decode QR code from an image file using OpenCV with improved binary handling.

    Returns the decoded text, a "base64:"-prefixed base64 string when the
    payload looks binary (e.g. an embedded PDF), or None when no QR code is
    found or decoding fails.
    """
    # NOTE(review): uses `cv2` and `base64` — the base64 import is added at
    # the file top in this commit; confirm `import cv2` exists there too.
    try:
        # Read image using OpenCV
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread returns None (does not raise) for unreadable paths.
            logger.error(f"Failed to read image: {image_path}")
            return None

        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Initialize QRCode detector
        detector = cv2.QRCodeDetector()

        # Detect and decode: data is the decoded string ("" if none),
        # vertices are the detected corner points (None if no code found).
        data, vertices, _ = detector.detectAndDecode(gray)

        if vertices is not None and data:
            # Check if this might be binary data (like a PDF)
            if data.startswith("%PDF") or not all(ord(c) < 128 for c in data):
                # This is likely binary data, encode as base64
                try:
                    # If it's already a string representation, convert to bytes first
                    if isinstance(data, str):
                        # NOTE(review): latin-1 only covers code points < 256;
                        # anything higher raises here and is caught below,
                        # falling through to return the raw string instead.
                        data_bytes = data.encode('latin-1')  # Use latin-1 to preserve byte values
                    else:
                        data_bytes = data

                    # Encode as base64
                    base64_data = base64.b64encode(data_bytes).decode('ascii')
                    return f"base64:{base64_data}"
                except Exception as e:
                    logger.error(f"Error encoding binary data: {e}")

            # Plain-text payload (or base64 encoding failed above).
            return data

        logger.warning("No QR code found in image")
        return None
    except Exception as e:
        logger.error(f"QR decoding error: {e}")
        return None
 
 
 
 
 
 
 
 
 
 
 
647
 
648
+ # Also update the datachat_interface function to handle base64 data
649
  def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: str, query: str) -> str:
650
+ """Interface for DataChat functionality with binary data support"""
651
  data = None
652
  if data_source == "JSON Input":
653
  data = json_input
654
  elif data_source == "QR Code":
655
  try:
656
  decoded_data = decode_qr_code(qr_image)
657
+
658
+ # Handle base64 encoded data
659
+ if decoded_data and decoded_data.startswith("base64:"):
660
+ base64_part = decoded_data[7:] # Remove the "base64:" prefix
661
+ try:
662
+ # For PDFs and other binary data, provide info about the content
663
+ binary_data = base64.b64decode(base64_part)
664
+ if binary_data.startswith(b"%PDF"):
665
+ data = "The QR code contains a PDF document. Binary data cannot be processed directly."
666
+ else:
667
+ # Try to decode as text as a fallback
668
+ data = binary_data.decode('utf-8', errors='replace')
669
+ except Exception as e:
670
+ logger.error(f"Error processing base64 data: {e}")
671
+ data = "The QR code contains binary data that cannot be processed directly."
672
+ else:
673
+ data = decoded_data
674
+
675
  if not data:
676
  return "No QR code found in the provided image."
677
  except Exception as e: