Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Mar 23

Commit

e3f9294

verified ·

1 Parent(s): 47e960f

Update app2.py

Browse files

Files changed (1) hide show

app2.py +106 -9

app2.py CHANGED Viewed

@@ -33,18 +33,79 @@ logger = logging.getLogger(__name__)
 # Ensure output directories exist
 Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
 class URLProcessor:
     def __init__(self):
         self.session = requests.Session()
-        self.timeout = 10  # seconds
         self.session.headers.update({
-            'User -Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
@@ -341,12 +402,48 @@ class FileProcessor:
             logger.error(f"QR generation error: {e}")
             return []
-    def decode_qr(image):
-        decoded_objects = decode(image)
-        results = []
-        for obj in decoded_objects:
-            results.append(obj.data.decode('utf-8'))
-        return results
             raise ValueError("Unable to decode QR code")
         except Exception as e:
@@ -437,4 +534,4 @@ iface = gr.Interface(fn=decode_qr, inputs="image", outputs="text")
 if __name__ == "__main__":
-    main()

 # Ensure output directories exist
 Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
+# TODO: remove the following imports from the top of the file:
+# from config import Config
+# from proxy_handler import ProxyHandler
+# from robots_handler import RobotsHandler
 class URLProcessor:
     def __init__(self):
+        """Create the shared HTTP session and set the default crawl settings."""
         self.session = requests.Session()
+        # Request timeout; presumably seconds -- TODO confirm where it is applied
+        self.timeout = 10
+        # Number of attempts fetch_content makes before giving up
+        self.max_retries = 3
+        # Delay handed to time.sleep() by fetch_content (seconds)
+        self.request_delay = 1.0
+        # When False, check_robots_txt allows every URL without checking
+        self.respect_robots = True
+        # Proxy settings; both are hard-coded to "off" here
+        self.use_proxy = False
+        self.proxy_url = None
+        # Default headers for the session; User-Agent comes from UserAgent().random
         self.session.headers.update({
+            'User-Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
+        # NOTE(review): use_proxy and proxy_url are assigned False/None just
+        # above, so this branch cannot run as written.
+        if self.use_proxy and self.proxy_url:
+            self.session.proxies = {
+                'http': self.proxy_url,
+                'https': self.proxy_url
+            }
+    def check_robots_txt(self, url: str) -> bool:
+        """Check if URL is allowed by robots.txt"""
+        if not self.respect_robots:
+            return True
+        try:
+            from urllib.parse import urlparse
+            from urllib.robotparser import RobotFileParser
+            parsed_url = urlparse(url)
+            robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt"
+            rp = RobotFileParser()
+            rp.set_url(robots_url)
+            rp.read()
+            return rp.can_fetch(self.session.headers['User-Agent'], url)
+        except Exception as e:
+            logger.warning(f"Error checking robots.txt: {e}")
+            return True
+    def fetch_content(self, url: str) -> Optional[Dict]:
+        """Fetch content with built-in rate limiting and robots.txt checking"""
+        if not self.check_robots_txt(url):
+            logger.warning(f"URL {url} is disallowed by robots.txt")
+            return None
+        time.sleep(self.request_delay)  # Basic rate limiting
+        for attempt in range(self.max_retries):
+            try:
+                if 'drive.google.com' in url:
+                    return self._handle_google_drive(url)
+                if 'calendar.google.com' in url:
+                    return self._handle_google_calendar(url)
+                return self._fetch_html_content(url)
+            except Exception as e:
+                logger.error(f"Attempt {attempt + 1} failed: {e}")
+                if attempt < self.max_retries - 1:
+                    time.sleep(self.request_delay * (attempt + 1))
+        return None
     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
             logger.error(f"QR generation error: {e}")
             return []
+    def decode_qr_code(image_path: str) -> Optional[str]:
+        """Decode QR code from an image file"""
+        try:
+            # Open and convert image to grayscale for better QR detection
+            img = Image.open(image_path).convert('L')
+            decoded_objects = decode(img)
+            if decoded_objects:
+                return decoded_objects[0].data.decode('utf-8')
+            logger.warning("No QR code found in image")
+            return None
+        except Exception as e:
+            logger.error(f"QR decoding error: {e}")
+            return None
+    # Replacement for the earlier decode_qr implementation
+    def decode_qr(image) -> List[str]:
+        """Decode all QR codes found in an image"""
+        try:
+            # Convert to PIL Image if needed
+            if not isinstance(image, Image.Image):
+                image = Image.fromarray(image)
+            # Convert to grayscale for better detection
+            image = image.convert('L')
+            # Decode QR codes
+            decoded_objects = decode(image)
+            results = []
+            for obj in decoded_objects:
+                try:
+                    decoded_text = obj.data.decode('utf-8')
+                    results.append(decoded_text)
+                except UnicodeDecodeError:
+                    logger.warning("Failed to decode QR code data as UTF-8")
+                    continue
+            return results
+        except Exception as e:
+            logger.error(f"QR decoding error: {e}")
+            return []
             raise ValueError("Unable to decode QR code")
         except Exception as e:
 if __name__ == "__main__":
+    main()