Update app2.py

app2.py (CHANGED)

@@ -1,11 +1,10 @@
 import json
 import os
 import re
-import time
 import logging
 import mimetypes
 from selenium import webdriver
-from chromedriver_py import binary_path
+from chromedriver_py import binary_path
 import concurrent.futures
 import string
 import zipfile
@@ -17,7 +16,6 @@ from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
-from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry
@@ -27,9 +25,6 @@ import nest_asyncio
 nest_asyncio.apply()
 import aiohttp
 
-svc = webdriver.ChromeService(executable_path=binary_path)
-driver = webdriver.Chrome(service=svc)
-
 # Setup logging
 logging.basicConfig(
     level=logging.INFO,
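The two removed lines above created a module-level Chrome driver the moment app2.py was imported, which launches a browser even when no request ever needs Selenium. A minimal sketch of a lazier alternative, assuming chromedriver_py stays available (the context-manager helper below is illustrative, not part of the app):

```python
from contextlib import contextmanager

from selenium import webdriver
from chromedriver_py import binary_path


@contextmanager
def chrome_driver():
    """Create a headless Chrome driver on demand and always quit it."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    service = webdriver.ChromeService(executable_path=binary_path)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        yield driver
    finally:
        driver.quit()
```

Callers would then write `with chrome_driver() as driver: driver.get(url)` only in the code paths that actually need a real browser.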
@@ -49,7 +44,7 @@ class URLProcessor:
         self.session = requests.Session()
         self.timeout = 10  # seconds
         self.session.headers.update({
-            'User-Agent': UserAgent().random,
+            'User -Agent': UserAgent().random,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
             'Accept-Language': 'en-US,en;q=0.5',
             'Accept-Encoding': 'gzip, deflate, br',
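The added header line appears with a stray space in the rendered diff ('User -Agent'); the intended header name is presumably 'User-Agent'. It is set once from fake_useragent when the session is created. A small illustrative sketch (not the app's code) that instead picks a new value on every request:

```python
import requests
from fake_useragent import UserAgent

ua = UserAgent()
session = requests.Session()


def get_with_fresh_ua(url: str, timeout: int = 10) -> requests.Response:
    """Fetch a URL, choosing a new random User-Agent for each request."""
    session.headers["User-Agent"] = ua.random
    return session.get(url, timeout=timeout)
```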
@@ -77,7 +72,7 @@ class URLProcessor:
             return cleaned_text
         except Exception as e:
             logger.warning(f"Text cleaning error: {e}. Using fallback method.")
-            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text) # Remove control characters
+            text = re.sub(r'[\x00 -\x1F\x7F-\x9F]', '', text) # Remove control characters
             text = text.encode('ascii', 'ignore').decode('ascii') # Remove non-ASCII characters
             text = re.sub(r'\s+', ' ', text) # Normalize whitespace
             return text.strip()
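As above, the added regex seems to have picked up a stray space ('[\x00 -\x1F...'); the character class presumably should read [\x00-\x1F\x7F-\x9F]. The fallback path strips control characters, drops non-ASCII bytes, and collapses whitespace; a standalone sketch of that behaviour:

```python
import re


def fallback_clean(text: str) -> str:
    """Fallback cleaning: drop control chars and non-ASCII, collapse whitespace."""
    text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)        # remove control characters
    text = text.encode('ascii', 'ignore').decode('ascii')   # remove non-ASCII characters
    text = re.sub(r'\s+', ' ', text)                        # normalize whitespace
    return text.strip()


print(fallback_clean("Héllo\x00   world\t!"))  # -> "Hllo world!"
```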
@@ -97,15 +92,10 @@ class URLProcessor:
     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
-            # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
-
-            # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
-
-            # Standard HTML processing
             return self._fetch_html_content(url)
         except Exception as e:
             logger.error(f"Content fetch failed: {e}")
@@ -153,12 +143,9 @@ class URLProcessor:
             response.raise_for_status()
 
             soup = BeautifulSoup(response.text, 'html.parser')
-
-            # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
 
-            # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body
 
             if main_content is None:
@@ -169,7 +156,6 @@ class URLProcessor:
                 'timestamp': datetime.now().isoformat()
             }
 
-            # Clean and structure content
             text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)
 
@@ -206,7 +192,7 @@ class FileProcessor:
         dataset = []
         try:
             file_size = os.path.getsize(file.name)
-
+            if file_size > self.max_file_size:
                 logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
                 return []
 
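The added line gives the warning and early return below it a condition to hang on. A self-contained sketch of the same guard, with an assumed 10 MB limit standing in for FileProcessor's real max_file_size attribute:

```python
import logging
import os

logger = logging.getLogger(__name__)

MAX_FILE_SIZE = 10 * 1024 * 1024  # assumed 10 MB cap; the real limit lives on FileProcessor


def within_size_limit(path: str) -> bool:
    """Return False (and log a warning) when a file exceeds the allowed size."""
    file_size = os.path.getsize(path)
    if file_size > MAX_FILE_SIZE:
        logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
        return False
    return True
```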
@@ -250,21 +236,17 @@ class FileProcessor:
         try:
             file_stat = os.stat(file.name)
 
-            # For very large files, read in chunks and summarize
             if file_stat.st_size > 100 * 1024 * 1024:  # 100MB
                 logger.info(f"Processing large file: {file.name} ({file_stat.st_size} bytes)")
 
-                # Read first and last 1MB for extremely large files
                 content = ""
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                     content = f.read(1 * 1024 * 1024)  # First 1MB
                     content += "\n...[Content truncated due to large file size]...\n"
 
-                    # Seek to the last 1MB
                     f.seek(max(0, file_stat.st_size - 1 * 1024 * 1024))
                     content += f.read()  # Last 1MB
             else:
-                # Regular file processing
                 with open(file.name, 'r', encoding='utf-8', errors='ignore') as f:
                     content = f.read()
 
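For context, the unchanged block above keeps only the first and last megabyte of files larger than 100 MB. A minimal standalone sketch of that head-and-tail strategy (the helper name and threshold default are illustrative):

```python
import os

CHUNK = 1 * 1024 * 1024  # 1 MB


def read_head_and_tail(path: str, threshold: int = 100 * 1024 * 1024) -> str:
    """Read a text file fully, or only its first and last 1 MB when it is very large."""
    size = os.path.getsize(path)
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        if size <= threshold:
            return f.read()
        head = f.read(CHUNK)              # first 1 MB
        f.seek(max(0, size - CHUNK))      # jump to roughly 1 MB before the end
        tail = f.read()                   # last ~1 MB
    return head + "\n...[Content truncated due to large file size]...\n" + tail
```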
@@ -285,13 +267,10 @@ class FileProcessor:
     def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
         """Clean and validate JSON data"""
         try:
-            # If it's a string, try to parse it
             if isinstance(data, str):
-                # Remove any existing content and extra whitespace
                 data = data.strip()
                 data = json.loads(data)
 
-            # Convert to string and back to ensure proper JSON format
             cleaned = json.loads(json.dumps(data))
             return cleaned
         except json.JSONDecodeError as e:
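clean_json parses string input and then round-trips the result through json.dumps/json.loads so that only JSON-serializable data comes back. A compact sketch of the same idea (TypeError is caught here as well, which the original's except clause does not do):

```python
import json
from typing import Dict, Optional, Union


def clean_json(data: Union[str, Dict]) -> Optional[Dict]:
    """Parse JSON strings and round-trip the result to validate serializability."""
    try:
        if isinstance(data, str):
            data = json.loads(data.strip())
        return json.loads(json.dumps(data))  # fails if anything is not JSON-serializable
    except (json.JSONDecodeError, TypeError):
        return None


print(clean_json('{"a": 1}'))   # {'a': 1}
print(clean_json('not json'))   # None
```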
@@ -308,7 +287,6 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
     output_dir.mkdir(parents=True, exist_ok=True)
 
     if combined:
-        # Generate single QR code for all data
         cleaned_data = clean_json(data)
         if cleaned_data:
             qr = qrcode.QRCode(
@@ -326,7 +304,6 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
             img.save(str(output_path))
             return [str(output_path)]
     else:
-        # Generate separate QR codes for each item
         if isinstance(data, list):
             paths = []
             for idx, item in enumerate(data):
@@ -339,7 +316,8 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
                     border=4,
                 )
                 json_str = json.dumps(cleaned_item, ensure_ascii=False)
-                qr.add_data(
+                qr.add_data(json
+                _str)
                 qr.make(fit=True)
 
                 img = qr.make_image(fill_color="black", back_color="white")
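The two added lines above are split mid-identifier in the rendered diff (the extraction also shows a stray code-fence fragment there); the intended statement is presumably the single call qr.add_data(json_str). A small sketch of the per-item QR generation the surrounding loop performs, using the qrcode package:

```python
import json

import qrcode


def qr_for_item(item: dict, path: str) -> str:
    """Encode one JSON-serializable item as a QR code image and save it."""
    qr = qrcode.QRCode(version=None, box_size=10, border=4)
    json_str = json.dumps(item, ensure_ascii=False)
    qr.add_data(json_str)   # presumably the intended single-line call
    qr.make(fit=True)
    img = qr.make_image(fill_color="black", back_color="white")
    img.save(path)
    return path


qr_for_item({"url": "https://example.com"}, "item_0.png")
```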
@@ -348,7 +326,6 @@ def generate_qr_code(data: Union[str, Dict], combined: bool = True) -> List[str]
                 paths.append(str(output_path))
             return paths
         else:
-            # Single item, not combined
             cleaned_item = clean_json(data)
             if cleaned_item:
                 qr = qrcode.QRCode(
@@ -459,10 +436,8 @@ def create_interface():
         try:
             results = []
 
-            # Process text input first (since it's direct JSON)
             if text and text.strip():
                 try:
-                    # Try to parse as JSON
                     json_data = json.loads(text)
                     if isinstance(json_data, list):
                         results.extend(json_data)
@@ -471,7 +446,6 @@ def create_interface():
                 except json.JSONDecodeError as e:
                     return None, [], f"❌ Invalid JSON format: {str(e)}"
 
-            # Process URLs if provided
             if urls and urls.strip():
                 processor = URLProcessor()
                 url_list = re.split(r'[,\n]', urls)
@@ -489,14 +463,12 @@ def create_interface():
                         'timestamp': datetime.now().isoformat()
                     })
 
-            # Process files if provided
             if file:
                 file_processor = FileProcessor()
                 file_results = file_processor.process_file(file)
                 if file_results:
                     results.extend(file_results)
 
-            # Generate QR codes
             if results:
                 qr_paths = generate_qr_code(results, combined=combine)
                 if qr_paths:
@@ -514,7 +486,6 @@ def create_interface():
             logger.error(f"Processing error: {e}")
             return None, [], f"❌ Error: {str(e)}"
 
-        # Set up event handlers
         example_btn.click(load_example, outputs=[text_input])
         clear_btn.click(clear_input, outputs=[text_input])
         process_btn.click(
@@ -542,16 +513,9 @@ def create_interface():
     return interface
 
 def main():
-    # Configure system settings
     mimetypes.init()
-
-    # Create output directories
     Path('output/qr_codes').mkdir(parents=True, exist_ok=True)
-
-    # Create and launch interface
     interface = create_interface()
-
-    # Launch with proper configuration
     interface.launch(
         server_name="0.0.0.0",
         server_port=8000,
@@ -562,6 +526,4 @@ def main():
     )
 
 if __name__ == "__main__":
-    main()
-    app.interface
-
+    main()
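main() binds Gradio to all interfaces on port 8000. If the hosting environment dictates the port instead, a small variation on the same launch call would be (the PORT variable here is hypothetical):

```python
import os

port = int(os.environ.get("PORT", "8000"))  # hypothetical override, falls back to 8000
interface = create_interface()
interface.launch(server_name="0.0.0.0", server_port=port)
```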