Spaces:
Running
Running
Update app2.py
Browse files
app2.py
CHANGED
@@ -33,18 +33,79 @@ logger = logging.getLogger(__name__)
|
|
33 |
# Ensure output directories exist
|
34 |
Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
|
35 |
|
|
|
|
|
|
|
|
|
|
|
36 |
class URLProcessor:
|
37 |
def __init__(self):
|
38 |
self.session = requests.Session()
|
39 |
-
self.timeout = 10
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
self.session.headers.update({
|
41 |
-
'User
|
42 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
43 |
'Accept-Language': 'en-US,en;q=0.5',
|
44 |
'Accept-Encoding': 'gzip, deflate, br',
|
45 |
'Connection': 'keep-alive',
|
46 |
'Upgrade-Insecure-Requests': '1'
|
47 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
def advanced_text_cleaning(self, text: str) -> str:
|
50 |
"""Robust text cleaning with version compatibility"""
|
@@ -341,12 +402,48 @@ class FileProcessor:
|
|
341 |
logger.error(f"QR generation error: {e}")
|
342 |
return []
|
343 |
|
344 |
-
def
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
350 |
|
351 |
raise ValueError("Unable to decode QR code")
|
352 |
except Exception as e:
|
@@ -437,4 +534,4 @@ iface = gr.Interface(fn=decode_qr, inputs="image", outputs="text")
|
|
437 |
|
438 |
|
439 |
if __name__ == "__main__":
|
440 |
-
main()
|
|
|
33 |
# Ensure output directories exist
|
34 |
Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
|
35 |
|
36 |
+
# At the top of the file, remove these imports:
|
37 |
+
# from config import Config
|
38 |
+
# from proxy_handler import ProxyHandler
|
39 |
+
# from robots_handler import RobotsHandler
|
40 |
+
|
41 |
class URLProcessor:
|
42 |
def __init__(self, *, timeout: int = 10, max_retries: int = 3,
             request_delay: float = 1.0, respect_robots: bool = True,
             use_proxy: bool = False, proxy_url: Optional[str] = None):
    """Create the shared HTTP session and store the fetch settings.

    All arguments are keyword-only and default to the values that were
    previously hard-coded, so ``URLProcessor()`` behaves exactly as before.

    Args:
        timeout: Stored as ``self.timeout``.
        max_retries: Number of attempts ``fetch_content`` makes per URL.
        request_delay: Seconds slept before a fetch; also the base of the
            growing delay between retries.
        respect_robots: When False, ``check_robots_txt`` allows every URL.
        use_proxy: Route session traffic through ``proxy_url`` when True.
        proxy_url: Proxy used for both http and https; ignored unless
            ``use_proxy`` is also True.
    """
    self.session = requests.Session()
    self.timeout = timeout
    self.max_retries = max_retries
    self.request_delay = request_delay
    self.respect_robots = respect_robots
    # Previously these two were fixed to False/None, which made the proxy
    # branch below unreachable; they are now caller-configurable.
    self.use_proxy = use_proxy
    self.proxy_url = proxy_url

    # Update session headers
    self.session.headers.update({
        'User-Agent': UserAgent().random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    })

    if self.use_proxy and self.proxy_url:
        self.session.proxies = {
            'http': self.proxy_url,
            'https': self.proxy_url
        }
|
66 |
+
|
67 |
+
def check_robots_txt(self, url: str) -> bool:
    """Check if URL is allowed by robots.txt.

    Args:
        url: Absolute URL that is about to be fetched.

    Returns:
        True when robots.txt checking is disabled, when the URL has no
        scheme/host to build a robots.txt address from, when the rules
        allow the session's User-Agent, or when the check itself fails
        (fail-open, with a warning logged). False when the rules disallow
        the URL.
    """
    if not self.respect_robots:
        return True

    import urllib.error
    import urllib.request
    from urllib.parse import urlparse
    from urllib.robotparser import RobotFileParser

    try:
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            # No robots.txt location can be derived; the fetch would only
            # fail and fall through to the fail-open path anyway.
            return True

        robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"

        rp = RobotFileParser()
        rp.set_url(robots_url)

        # RobotFileParser.read() opens the URL without a timeout, so an
        # unresponsive host could block forever. Fetch it here with
        # self.timeout and apply the same status-code rules read() uses.
        try:
            with urllib.request.urlopen(robots_url, timeout=self.timeout) as response:
                raw = response.read()
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                rp.disallow_all = True
            elif 400 <= err.code < 500:
                rp.allow_all = True
        else:
            rp.parse(raw.decode('utf-8').splitlines())

        return rp.can_fetch(self.session.headers['User-Agent'], url)
    except Exception as e:
        logger.warning(f"Error checking robots.txt: {e}")
        return True
|
87 |
+
|
88 |
+
def fetch_content(self, url: str) -> Optional[Dict]:
    """Fetch content with built-in rate limiting and robots.txt checking.

    The URL is first checked with ``check_robots_txt``. If allowed, the
    method sleeps ``request_delay`` seconds and then makes up to
    ``max_retries`` attempts, sleeping a little longer after each failure.

    Args:
        url: Address to fetch.

    Returns:
        Whatever the selected handler returns (Google Drive, Google
        Calendar or generic HTML), or None when robots.txt disallows the
        URL or every attempt raised.
    """
    from urllib.parse import urlparse

    if not self.check_robots_txt(url):
        logger.warning(f"URL {url} is disallowed by robots.txt")
        return None

    time.sleep(self.request_delay)  # Basic rate limiting

    # Dispatch on the host name rather than on a substring of the whole
    # URL, so "drive.google.com" inside a path or query string (for
    # example a redirect parameter) is not mistaken for a Drive link.
    try:
        # Scheme-less input such as "drive.google.com/file/..." has no
        # netloc unless it is prefixed with "//".
        parsed = urlparse(url if '://' in url else f'//{url}')
        host = (parsed.hostname or '').lower()
    except ValueError:
        host = ''

    def _is_host(name: str) -> bool:
        return host == name or host.endswith('.' + name)

    if _is_host('drive.google.com'):
        handler = self._handle_google_drive
    elif _is_host('calendar.google.com'):
        handler = self._handle_google_calendar
    else:
        handler = self._fetch_html_content

    for attempt in range(self.max_retries):
        try:
            return handler(url)
        except Exception as e:
            logger.error(f"Attempt {attempt + 1} failed: {e}")
            if attempt < self.max_retries - 1:
                # Wait longer after each failed attempt.
                time.sleep(self.request_delay * (attempt + 1))

    return None
|
109 |
|
110 |
def advanced_text_cleaning(self, text: str) -> str:
|
111 |
"""Robust text cleaning with version compatibility"""
|
|
|
402 |
logger.error(f"QR generation error: {e}")
|
403 |
return []
|
404 |
|
405 |
+
def decode_qr_code(image_path: str) -> Optional[str]:
    """Decode QR code from an image file.

    Args:
        image_path: Path of the image to open.

    Returns:
        The UTF-8 text of the first code that ``decode`` reports, or None
        when no code is found or any error occurs (the error is logged).
    """
    try:
        # Open via a context manager so the file handle is released even
        # if conversion fails; convert() returns a separate in-memory
        # image, so it stays usable after the file is closed.
        with Image.open(image_path) as source:
            # Grayscale for better QR detection
            img = source.convert('L')

        decoded_objects = decode(img)

        if decoded_objects:
            return decoded_objects[0].data.decode('utf-8')
        logger.warning("No QR code found in image")
        return None
    except Exception as e:
        logger.error(f"QR decoding error: {e}")
        return None
|
419 |
+
|
420 |
+
# NOTE: this definition is meant to replace the earlier decode_qr; remove the old one so only a single definition remains.
|
421 |
+
def decode_qr(image) -> List[str]:
    """Decode all QR codes found in an image.

    Args:
        image: A PIL image, or any object ``Image.fromarray`` accepts.
            None yields an empty result.

    Returns:
        The UTF-8 text of every decoded code, in the order ``decode``
        reports them. Payloads that are not valid UTF-8 are skipped with
        a warning. Any other failure is logged and an empty list returned.
    """
    if image is None:
        # Nothing to decode; previously this was only caught by the broad
        # except below and logged as an error.
        return []

    try:
        # Convert to PIL Image if needed
        if not isinstance(image, Image.Image):
            image = Image.fromarray(image)

        # Convert to grayscale for better detection
        image = image.convert('L')

        # Decode QR codes
        results = []
        for obj in decode(image):
            try:
                results.append(obj.data.decode('utf-8'))
            except UnicodeDecodeError:
                logger.warning("Failed to decode QR code data as UTF-8")

        return results
    except Exception as e:
        logger.error(f"QR decoding error: {e}")
        return []
|
447 |
|
448 |
raise ValueError("Unable to decode QR code")
|
449 |
except Exception as e:
|
|
|
534 |
|
535 |
|
536 |
if __name__ == "__main__":
|
537 |
+
main()
|