Update app.py

app.py CHANGED
@@ -12,11 +12,18 @@ from typing import List, Dict, Tuple, Union, Optional
 import requests
 import validators
 import gradio as gr
-import
+from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from cleantext import clean
 import qrcode
+import PyPDF2
+from PIL import Image
+import pytesseract
+import cv2
+import numpy as np
+import fitz  # PyMuPDF
+import zipfile

 # Setup logging with detailed configuration
 logging.basicConfig(
@@ -41,6 +48,13 @@ class URLProcessor:
             'Connection': 'keep-alive',
             'Upgrade-Insecure-Requests': '1'
         })
+        self.supported_content_types = {
+            'text/html': self._fetch_html_content,
+            'application/pdf': self._fetch_pdf_content,
+            'image': self._fetch_image_content,
+            'application/json': self._fetch_json_content,
+            'text/plain': self._fetch_text_content
+        }

     def advanced_text_cleaning(self, text: str) -> str:
         """Robust text cleaning with version compatibility"""
@@ -80,18 +94,31 @@ class URLProcessor:
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}

     def fetch_content(self, url: str) -> Optional[Dict]:
-        """Universal content fetcher with
+        """Universal content fetcher with enhanced content type handling"""
         try:
-            #
+            # Special case handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
-
-            # Google Calendar ICS handling
             if 'calendar.google.com' in url and 'ical' in url:
                 return self._handle_google_calendar(url)
-
-            #
-
+
+            # Get content type
+            response = self.session.head(url, timeout=self.timeout)
+            content_type = response.headers.get('Content-Type', '').split(';')[0].lower()
+
+            # Find appropriate handler
+            handler = None
+            for supported_type, type_handler in self.supported_content_types.items():
+                if content_type.startswith(supported_type):
+                    handler = type_handler
+                    break
+
+            if handler:
+                return handler(url)
+            else:
+                logger.warning(f"Unsupported content type: {content_type}")
+                return self._fetch_text_content(url)
+
         except Exception as e:
             logger.error(f"Content fetch failed: {e}")
             return None
@@ -131,9 +158,8 @@ class URLProcessor:
             logger.error(f"Calendar fetch failed: {e}")
             return None

-
     def _fetch_html_content(self, url: str) -> Optional[Dict]:
-        """
+        """Enhanced HTML content processing with metadata extraction"""
         try:
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
@@ -147,14 +173,13 @@ class URLProcessor:
             # Extract main content
             main_content = soup.find('main') or soup.find('article') or soup.body

-            #
-
-
-
-
-
-
-            }
+            # Extract metadata
+            metadata = {
+                'title': soup.title.string if soup.title else None,
+                'description': soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else None,
+                'keywords': soup.find('meta', {'name': 'keywords'})['content'] if soup.find('meta', {'name': 'keywords'}) else None,
+                'author': soup.find('meta', {'name': 'author'})['content'] if soup.find('meta', {'name': 'author'}) else None
+            }

             # Clean and structure content
             text_content = main_content.get_text(separator='\n', strip=True)
@@ -162,12 +187,144 @@ class URLProcessor:

             return {
                 'content': cleaned_content,
+                'metadata': metadata,
                 'content_type': response.headers.get('Content-Type', ''),
                 'timestamp': datetime.now().isoformat()
             }
         except Exception as e:
             logger.error(f"HTML processing failed: {e}")
             return None
+
+    def _fetch_pdf_content(self, url: str) -> Optional[Dict]:
+        """Process PDF content with enhanced metadata extraction"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            with tempfile.NamedTemporaryFile(suffix='.pdf') as temp_file:
+                temp_file.write(response.content)
+                temp_file.flush()
+
+                # Extract text and metadata using PyMuPDF
+                doc = fitz.open(temp_file.name)
+
+                # Extract text with formatting preservation
+                text = ""
+                metadata = {
+                    'title': doc.metadata.get('title'),
+                    'author': doc.metadata.get('author'),
+                    'subject': doc.metadata.get('subject'),
+                    'keywords': doc.metadata.get('keywords'),
+                    'creator': doc.metadata.get('creator'),
+                    'producer': doc.metadata.get('producer'),
+                    'page_count': len(doc),
+                    'file_size': os.path.getsize(temp_file.name),
+                    'version': doc.version
+                }
+
+                # Extract text with layout preservation
+                for page in doc:
+                    blocks = page.get_text("blocks")
+                    for block in blocks:
+                        if block[6] == 0:  # Text block
+                            text += block[4] + "\n"
+
+                doc.close()
+                cleaned_content = self.advanced_text_cleaning(text)
+
+                return {
+                    'content': cleaned_content,
+                    'metadata': metadata,
+                    'content_type': 'application/pdf',
+                    'timestamp': datetime.now().isoformat()
+                }
+        except Exception as e:
+            logger.error(f"PDF processing failed: {e}")
+            return None
+
+    def _fetch_image_content(self, url: str) -> Optional[Dict]:
+        """Process image content with OCR and advanced image processing"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            with tempfile.NamedTemporaryFile(suffix='.jpg') as temp_file:
+                temp_file.write(response.content)
+                temp_file.flush()
+
+                # Load image with OpenCV
+                img = cv2.imread(temp_file.name)
+                if img is None:
+                    raise ValueError("Failed to load image")
+
+                # Image preprocessing for better OCR
+                gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+                denoised = cv2.fastNlMeansDenoising(gray)
+                thresh = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
+
+                # Extract text using Tesseract
+                text = pytesseract.image_to_string(thresh)
+                cleaned_text = self.advanced_text_cleaning(text) if text else None
+
+                # Extract metadata and additional image features
+                with Image.open(temp_file.name) as pil_img:
+                    exif = pil_img._getexif() if hasattr(pil_img, '_getexif') else None
+                    metadata = {
+                        'format': pil_img.format,
+                        'mode': pil_img.mode,
+                        'size': pil_img.size,
+                        'exif': exif,
+                        'image_features': {
+                            'resolution': img.shape,
+                            'channels': img.shape[2] if len(img.shape) > 2 else 1,
+                            'mean_brightness': np.mean(gray),
+                            'has_text': bool(cleaned_text and cleaned_text.strip())
+                        }
+                    }
+
+                return {
+                    'content': cleaned_text,
+                    'metadata': metadata,
+                    'content_type': response.headers.get('Content-Type', ''),
+                    'timestamp': datetime.now().isoformat()
+                }
+        except Exception as e:
+            logger.error(f"Image processing failed: {e}")
+            return None
+
+    def _fetch_json_content(self, url: str) -> Optional[Dict]:
+        """Process JSON content"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            content = response.json()
+
+            return {
+                'content': json.dumps(content, indent=2),
+                'content_type': 'application/json',
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"JSON processing failed: {e}")
+            return None
+
+    def _fetch_text_content(self, url: str) -> Optional[Dict]:
+        """Process plain text content"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            cleaned_content = self.advanced_text_cleaning(response.text)
+
+            return {
+                'content': cleaned_content,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"Text processing failed: {e}")
+            return None

 class FileProcessor:
     """Class to handle file processing"""