acecalisto3 committed
Commit e3f9294 · verified · 1 Parent(s): 47e960f

Update app2.py

Files changed (1)
  1. app2.py +106 -9
app2.py CHANGED
@@ -33,18 +33,79 @@ logger = logging.getLogger(__name__)
 # Ensure output directories exist
 Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
 
+# At the top of the file, remove these imports:
+# from config import Config
+# from proxy_handler import ProxyHandler
+# from robots_handler import RobotsHandler
+
 class URLProcessor:
     def __init__(self):
         self.session = requests.Session()
-        self.timeout = 10  # seconds
+        self.timeout = 10
+        self.max_retries = 3
+        self.request_delay = 1.0
+        self.respect_robots = True
+        self.use_proxy = False
+        self.proxy_url = None
+
+        # Update session headers
         self.session.headers.update({
-            'User -Agent': UserAgent().random,
+            'User-Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
+
+        if self.use_proxy and self.proxy_url:
+            self.session.proxies = {
+                'http': self.proxy_url,
+                'https': self.proxy_url
+            }
+
+    def check_robots_txt(self, url: str) -> bool:
+        """Check if URL is allowed by robots.txt"""
+        if not self.respect_robots:
+            return True
+
+        try:
+            from urllib.parse import urlparse
+            from urllib.robotparser import RobotFileParser
+
+            parsed_url = urlparse(url)
+            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
+
+            rp = RobotFileParser()
+            rp.set_url(robots_url)
+            rp.read()
+
+            return rp.can_fetch(self.session.headers['User-Agent'], url)
+        except Exception as e:
+            logger.warning(f"Error checking robots.txt: {e}")
+            return True
+
+    def fetch_content(self, url: str) -> Optional[Dict]:
+        """Fetch content with built-in rate limiting and robots.txt checking"""
+        if not self.check_robots_txt(url):
+            logger.warning(f"URL {url} is disallowed by robots.txt")
+            return None
+
+        time.sleep(self.request_delay)  # Basic rate limiting
+
+        for attempt in range(self.max_retries):
+            try:
+                if 'drive.google.com' in url:
+                    return self._handle_google_drive(url)
+                if 'calendar.google.com' in url:
+                    return self._handle_google_calendar(url)
+                return self._fetch_html_content(url)
+            except Exception as e:
+                logger.error(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < self.max_retries - 1:
+                    time.sleep(self.request_delay * (attempt + 1))
+
+        return None
 
     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
 
@@ -341,12 +402,48 @@ class FileProcessor:
             logger.error(f"QR generation error: {e}")
             return []
 
-    def decode_qr(image):
-        decoded_objects = decode(image)
-        results = []
-        for obj in decoded_objects:
-            results.append(obj.data.decode('utf-8'))
-        return results
+    def decode_qr_code(image_path: str) -> Optional[str]:
+        """Decode QR code from an image file"""
+        try:
+            # Open and convert image to grayscale for better QR detection
+            img = Image.open(image_path).convert('L')
+            decoded_objects = decode(img)
+
+            if decoded_objects:
+                return decoded_objects[0].data.decode('utf-8')
+            logger.warning("No QR code found in image")
+            return None
+        except Exception as e:
+            logger.error(f"QR decoding error: {e}")
+            return None
+
+    # Replace the existing decode_qr function with this one
+    def decode_qr(image) -> List[str]:
+        """Decode all QR codes found in an image"""
+        try:
+            # Convert to PIL Image if needed
+            if not isinstance(image, Image.Image):
+                image = Image.fromarray(image)
+
+            # Convert to grayscale for better detection
+            image = image.convert('L')
+
+            # Decode QR codes
+            decoded_objects = decode(image)
+            results = []
+
+            for obj in decoded_objects:
+                try:
+                    decoded_text = obj.data.decode('utf-8')
+                    results.append(decoded_text)
+                except UnicodeDecodeError:
+                    logger.warning("Failed to decode QR code data as UTF-8")
+                    continue
+
+            return results
+        except Exception as e:
+            logger.error(f"QR decoding error: {e}")
+            return []
 
             raise ValueError("Unable to decode QR code")
         except Exception as e:
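The rewritten decode_qr follows the usual pyzbar pattern: normalize the input to a grayscale PIL image, call decode, and collect each object's data payload. A self-contained round-trip check of that pattern, assuming the qrcode, Pillow, and pyzbar packages app2.py already relies on are installed; the file name is a placeholder:

import qrcode
from PIL import Image
from pyzbar.pyzbar import decode

# Generate a QR code and write it to disk (placeholder path).
qrcode.make("https://example.com").save("sample_qr.png")

# Read it back the same way the new decode_qr does: grayscale PIL image -> decode().
img = Image.open("sample_qr.png").convert("L")
payloads = [obj.data.decode("utf-8") for obj in decode(img)]
print(payloads)  # expected: ['https://example.com']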
 
@@ -437,4 +534,4 @@ iface = gr.Interface(fn=decode_qr, inputs="image", outputs="text")
 
 
 if __name__ == "__main__":
-    main()
+    main()
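The context line in this last hunk's header, iface = gr.Interface(fn=decode_qr, inputs="image", outputs="text"), shows how the decoder is exposed. A hypothetical standalone launcher mirroring that line, assuming decode_qr ends up importable from app2 at module level (the diff does not show its final placement):

import gradio as gr

from app2 import decode_qr  # assumption: exposed at module level, as the context line suggests

# Same interface as referenced in the hunk header above; Gradio passes the uploaded
# image in as a numpy array, which decode_qr converts with Image.fromarray().
iface = gr.Interface(fn=decode_qr, inputs="image", outputs="text")

if __name__ == "__main__":
    iface.launch()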