Update app.py

app.py CHANGED
@@ -13,10 +13,11 @@ from urllib.parse import urlparse
 import requests
 import validators
 import gradio as gr
-import
+from diskcache import Cache
 from bs4 import BeautifulSoup
 from fake_useragent import UserAgent
 from ratelimit import limits, sleep_and_retry
+from cleantext import clean

 # Setup logging with detailed configuration
 logging.basicConfig(
@@ -30,17 +31,13 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)

 class URLProcessor:
-        self.timeout = timeout
-        self.max_retries = max_retries
-        self.concurrent_requests = concurrent_requests
-        self.ua = UserAgent()  # Initialize UserAgent
-        #
-        self.url_cache =
-        self.content_cache =
+    def __init__(self, timeout=15, max_retries=3, concurrent_requests=5, cache_dir='cache'):
+        self.timeout = timeout
+        self.max_retries = max_retries
+        self.concurrent_requests = concurrent_requests
+        self.ua = UserAgent()  # Initialize UserAgent
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(exist_ok=True)

+        # Persistent disk-based caches
+        self.url_cache = Cache(str(self.cache_dir / 'url_cache'))
+        self.content_cache = Cache(str(self.cache_dir / 'content_cache'), size_limit=2**30)

         self.session = requests.Session()
         self.session.headers.update({
@@ -97,63 +94,64 @@ class URLProcessor:

     @sleep_and_retry
     @limits(calls=20, period=60)  # Refined rate limiting
-    def fetch_content(self, url: str) -> Optional[str]:
-        ...
-        self.content_cache[url] = cleaned_text
-        return cleaned_text
-        ...
+    def fetch_content(self, url: str) -> Optional[Dict]:
+        cached = self.content_cache.get(url)
+        if cached:
+            return cached
+
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Extract structured elements
+            title = soup.title.text.strip() if soup.title else ''
+            meta_desc = soup.find('meta', {'name': 'description'})
+            description = meta_desc['content'].strip() if meta_desc else ''
+
+            headings = [{'level': h.name, 'text': h.text.strip()}
+                        for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
+
+            links = [a['href'] for a in soup.find_all('a', href=True)
+                     if validators.url(a['href'])]
+
+            # Main content extraction
+            for element in soup(['script', 'style', 'nav', 'footer']):
+                element.decompose()
+
+            main_content = soup.find('main') or soup.find('article') or soup.body
+            text = main_content.get_text(separator=' ') if main_content else ''
+
+            structured_data = {
+                'title': title,
+                'description': description,
+                'headings': headings,
+                'links': links,
+                'content': self.advanced_text_cleaning(text),
+                'status_code': response.status_code,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
+
+            self.content_cache.set(url, structured_data, expire=3600)
+            return structured_data
+        except Exception as e:
+            logger.error(f"Error fetching {url}: {e}")
+            return None

     def advanced_text_cleaning(self, text: str) -> str:
-        ...
-        # Remove HTML entities
-        text = re.sub(r'&[a-zA-Z]+;', '', text)
-
-        # Normalize quotation marks
-        text = text.replace('“', '"').replace('”', '"')
-        text = text.replace('‘', "'").replace('’', "'")
-
-        # Remove excessive punctuation
-        text = re.sub(r'([.,!?]){2,}', r'\1', text)
-
-        return text.strip()
+        return clean(text,
+                     fix_unicode=True,
+                     to_ascii=True,
+                     lower=True,
+                     no_line_breaks=True,
+                     no_urls=True,
+                     no_emails=True,
+                     no_phone_numbers=True,
+                     no_numbers=False,
+                     no_digits=False,
+                     no_currency_symbols=True,
+                     no_punct=False
+                     ).strip()

 class FileProcessor:
     """Class to handle file processing"""
@@ -170,6 +168,8 @@ class FileProcessor:
         except Exception:
             return False

+
+
     def process_file(self, file) -> List[Dict]:
        """Process uploaded file with enhanced error handling"""
        if not file:
@@ -218,20 +218,23 @@ class FileProcessor:
         return results

     def _process_single_file(self, file) -> List[Dict]:
-        """Process single file"""
-        results = []
         try:
+            file_stat = os.stat(file.name)
             content = file.read().decode('utf-8', errors='ignore')
-            ...
+
+            return [{
+                'source': 'file',
+                'filename': os.path.basename(file.name),
+                'file_size': file_stat.st_size,
+                'mime_type': mimetypes.guess_type(file.name)[0],
+                'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
+                'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
+                'content': content,
+                'timestamp': datetime.now().isoformat()
+            }]
         except Exception as e:
-            logger.error(f"
+            logger.error(f"File processing error: {e}")
+            return []

 def create_interface():
     """Create a comprehensive Gradio interface with advanced features"""
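A note on the new persistent caches: diskcache's Cache keeps entries on disk with an optional per-key expiry and an overall size cap, and get() returns None on a miss, which is what the early-return check at the top of fetch_content relies on. A minimal sketch of those semantics (the path below is illustrative):

    from diskcache import Cache

    # Same construction pattern as in __init__; the directory is created on demand.
    cache = Cache('cache/content_cache', size_limit=2**30)  # cap total size at ~1 GiB

    # Entries written with expire= are evicted after that many seconds.
    cache.set('https://example.com', {'title': 'Example'}, expire=3600)

    print(cache.get('https://example.com'))      # {'title': 'Example'} until expiry
    print(cache.get('https://missing.example'))  # None -> treated as a cache miss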
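For reference, clean-text with the flags used above lowercases, ASCII-folds, collapses line breaks, and substitutes placeholder tokens for URLs, e-mails, and phone numbers. A rough sketch (the exact placeholder tokens are the library's defaults, quoted from memory):

    from cleantext import clean

    sample = 'Café menu — visit https://cafe.example or mail us at hi@cafe.example!'
    print(clean(sample, fix_unicode=True, to_ascii=True, lower=True,
                no_line_breaks=True, no_urls=True, no_emails=True))
    # roughly: 'cafe menu - visit <url> or mail us at <email>!'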
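Finally, an end-to-end sketch of the updated fetch path, assuming this app.py is importable and that diskcache and clean-text are listed in the Space's requirements.txt (the module name app is an assumption):

    from app import URLProcessor  # assumed module name for this Space's app.py

    processor = URLProcessor(timeout=15, cache_dir='cache')

    # First call fetches over HTTP, parses with BeautifulSoup, and caches
    # the structured dict for an hour.
    result = processor.fetch_content('https://example.com')
    if result:
        print(result['title'], result['status_code'])
        print(result['content'][:200])  # cleaned via cleantext.clean

    # A repeat call within the hour is answered from the on-disk cache
    # instead of the network (the rate limiter still counts the call,
    # since @limits wraps the whole method).
    again = processor.fetch_content('https://example.com')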