Update app.py
app.py CHANGED
@@ -31,152 +31,90 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 class URLProcessor:
-    def __init__(self, timeout=15, max_retries=3, cache_dir='cache'):
-        self.ua = UserAgent()  # Initialize UserAgent first
-        self.timeout = timeout
-        self.max_retries = max_retries
-
-        # Persistent caching setup
-        self.cache_dir = Path(cache_dir)
-        self.cache_dir.mkdir(exist_ok=True)
-        self.url_cache = Cache(str(self.cache_dir / 'url_cache'))
-        self.content_cache = Cache(str(self.cache_dir / 'content_cache'), size_limit=2**30)
-
-        # Session configuration
-        self.session = requests.Session()
-        self.session.headers.update({
-            'User-Agent': self.ua.random,  # Correct header key
-            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-            'Accept-Language': 'en-US,en;q=0.5',
-            'Connection': 'keep-alive'
-        })
-    @sleep_and_retry
-    @limits(calls=10, period=60)  # Rate limiting: 10 calls per minute
-    def validate_url(self, url: str) -> Dict[str, Union[bool, str]]:
-        """Validate URL format and accessibility"""
-        try:
-            ...
-            try:
-                response = self.session.head(url, timeout=self.timeout, allow_redirects=True)
-                validation_result['is_valid'] = response.status_code in [200, 301, 302]
-                validation_result['status_code'] = response.status_code
-                validation_result['message'] = f"URL is {'valid' if validation_result['is_valid'] else 'invalid'}"
-            except requests.RequestException as e:
-                validation_result['message'] = f"Connection error: {str(e)}"
-
-            # Cache the result
-            self.url_cache[url] = validation_result
-            return validation_result
-
-        except Exception as e:
-            logger.error(f"Unexpected error validating URL {url}: {e}")
-            return {
-                'is_valid': False,
-                'message': f"Unexpected validation error: {str(e)}"
-            }
-
-    @sleep_and_retry
-    @limits(calls=20, period=60)  # Refined rate limiting
-    def fetch_content(self, url: str) -> Optional[Dict]:
-        """..."""
-        if 'drive.google.com' in url:
-            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-            if file_id:
-                url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-            else:
-                logger.error(f"Invalid Google Drive URL: {url}")
-                return None
-
-        cached = self.content_cache.get(url)
-        if cached:
-            return cached
-
-        try:
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
-
-            # Handle ICS files (Google Calendar)
-            if 'text/calendar' in response.headers.get('Content-Type', ''):
-                return {
-                    'content': response.text,
-                    'content_type': 'text/calendar',
-                    'timestamp': datetime.now().isoformat()
-                }
-
-            # Handle HTML content
-            soup = BeautifulSoup(response.text, 'html.parser')
-            title = soup.title.text.strip() if soup.title else ''
-            meta_desc = soup.find('meta', {'name': 'description'})
-            description = meta_desc['content'].strip() if meta_desc else ''
-
-            headings = [{'level': h.name, 'text': h.text.strip()}
-                        for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
-
-            links = [a['href'] for a in soup.find_all('a', href=True)
-                     if validators.url(a['href'])]
-
-            # Main content extraction
-            for element in soup(['script', 'style', 'nav', 'footer']):
-                element.decompose()
-
-            text = soup.get_text()
-
-            structured_data = {
-                'title': title,
-                'description': description,
-                'headings': headings,
-                'links': links,
-                'content': self.advanced_text_cleaning(text),
-                'status_code': response.status_code,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
-
-            self.content_cache.set(url, structured_data, expire=3600)
-            return structured_data
-        except Exception as e:
-            logger.error(f"...")
-            return None
-
-    def advanced_text_cleaning(self, text: str) -> str:
-        ...
-        ).strip()
+    def advanced_text_cleaning(self, text: str) -> str:
+        """Robust text cleaning with version compatibility"""
+        try:
+            # Modern clean-text parameters
+            return clean(text,
+                         fix_unicode=True,
+                         to_ascii=True,
+                         lower=True,
+                         no_line_breaks=True,
+                         no_urls=True,
+                         no_emails=True,
+                         no_phone_numbers=True,
+                         no_numbers=False,
+                         no_digits=False,
+                         no_currency_symbols=True,
+                         no_punct=False
+                         ).strip()
+        except TypeError as e:
+            # Fallback to basic cleaning
+            logger.warning("Using fallback text cleaning method")
+            text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)  # Control chars
+            text = text.encode('ascii', 'ignore').decode('ascii')  # Unicode
+            text = re.sub(r'\s+', ' ', text)  # Whitespace
+            return text.strip()
+
+    def fetch_content(self, url: str) -> Optional[Dict]:
+        """Universal content fetcher with special case handling"""
+        # Google Drive document handling
+        if 'drive.google.com' in url:
+            return self._handle_google_drive(url)
+
+        # Google Calendar ICS handling
+        if 'calendar.google.com' in url and 'ical' in url:
+            return self._handle_google_calendar(url)
+
+        # Standard HTML processing
+        return self._fetch_html_content(url)
+
+    def _handle_google_drive(self, url: str) -> Optional[Dict]:
+        """Process Google Drive file links"""
+        try:
+            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
+            if not file_id:
+                logger.error(f"Invalid Google Drive URL: {url}")
+                return None
+
+            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
+            response = self.session.get(direct_url, timeout=self.timeout)
+            response.raise_for_status()
+
+            return {
+                'content': response.text,
+                'content_type': response.headers.get('Content-Type', ''),
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"Google Drive processing failed: {e}")
+            return None
+
+    def _handle_google_calendar(self, url: str) -> Optional[Dict]:
+        """Process Google Calendar ICS feeds"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+            return {
+                'content': response.text,
+                'content_type': 'text/calendar',
+                'timestamp': datetime.now().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"Calendar fetch failed: {e}")
+            return None
+
+    def _fetch_html_content(self, url: str) -> Optional[Dict]:
+        """Standard HTML content processing"""
+        try:
+            response = self.session.get(url, timeout=self.timeout)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            # ... existing HTML processing logic ...
+            return structured_data
+        except Exception as e:
+            logger.error(f"HTML processing failed: {e}")
+            return None
+
 class FileProcessor:
     """Class to handle file processing"""
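
Note: as committed, `_fetch_html_content` replaces the old inline parsing with the placeholder comment `# ... existing HTML processing logic ...`, so `structured_data` is never assigned (a `NameError` at runtime), and the `response.raise_for_status()` call and `content_cache` reuse from the removed version are gone. A minimal sketch of what that placeholder presumably stands for, reconstructed from the removed inline logic above; the helper name `build_structured_data` and the `clean_text` parameter are illustrative, not part of the commit:

# Illustrative sketch only: reconstructs the elided HTML parsing from the
# removed inline version; the names below are assumptions, not commit code.
from datetime import datetime
from typing import Callable, Dict

import validators
from bs4 import BeautifulSoup

def build_structured_data(html: str, status_code: int, content_type: str,
                          clean_text: Callable[[str], str]) -> Dict:
    """Extract title, meta description, headings, links and cleaned body text."""
    soup = BeautifulSoup(html, 'html.parser')

    title = soup.title.text.strip() if soup.title else ''
    meta_desc = soup.find('meta', {'name': 'description'})
    description = meta_desc.get('content', '').strip() if meta_desc else ''

    headings = [{'level': h.name, 'text': h.text.strip()}
                for h in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]
    links = [a['href'] for a in soup.find_all('a', href=True)
             if validators.url(a['href'])]

    # Drop non-content elements before extracting the visible text
    for element in soup(['script', 'style', 'nav', 'footer']):
        element.decompose()
    text = soup.get_text(separator=' ')

    return {
        'title': title,
        'description': description,
        'headings': headings,
        'links': links,
        'content': clean_text(text),
        'status_code': status_code,
        'content_type': content_type,
        'timestamp': datetime.now().isoformat(),
    }

Inside `_fetch_html_content` this would be called as `structured_data = build_structured_data(response.text, response.status_code, response.headers.get('Content-Type', ''), self.advanced_text_cleaning)`; restoring `response.raise_for_status()` and the `self.content_cache` read/write from the removed version would keep the refactor behavior-compatible.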
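
For reference, the keyword arguments the new `advanced_text_cleaning` passes to `clean()` match the clean-text package's API; the `except TypeError` branch exists because older releases of that package reject some of these parameters. A standalone sketch of the call, assuming `pip install clean-text` (the sample string is illustrative):

# Standalone sketch, assuming the clean-text package (pip install clean-text).
from cleantext import clean

sample = "Visit https://example.com, email info@example.com, café price: $5\n"
print(clean(sample,
            fix_unicode=True,           # repair mis-decoded unicode
            to_ascii=True,              # transliterate to plain ASCII
            lower=True,
            no_line_breaks=True,        # collapse newlines into spaces
            no_urls=True,               # URLs become the "<URL>" placeholder by default
            no_emails=True,             # emails become "<EMAIL>"
            no_currency_symbols=True))  # "$" becomes "<CUR>"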