Update app.py
app.py
CHANGED
@@ -73,15 +73,28 @@ class URLProcessor:
         if not validators.url(url):
             return {'is_valid': False, 'message': 'Invalid URL format'}

+            # Some sites block HEAD requests but allow GET
+            try:
+                response = self.session.head(url, timeout=self.timeout)
+                response.raise_for_status()
+            except (requests.exceptions.RequestException, Exception) as e:
+                logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
+                # Try with GET request if HEAD fails
+                response = self.session.get(url, timeout=self.timeout, stream=True)
+                response.raise_for_status()
+                # Close the connection to avoid downloading the entire content
+                response.close()
+
             return {'is_valid': True, 'message': 'URL is valid and accessible'}
         except Exception as e:
+            logger.error(f"URL validation failed for {url}: {str(e)}")
             return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}

     def fetch_content(self, url: str) -> Optional[Dict]:
         """Universal content fetcher with special case handling"""
         try:
+            logger.info(f"Fetching content from: {url}")
+
             # Google Drive document handling
             if 'drive.google.com' in url:
                 return self._handle_google_drive(url)
@@ -91,52 +104,37 @@ class URLProcessor:
                 return self._handle_google_calendar(url)

             # Standard HTML processing
-        except Exception as e:
-            logger.error(f"Content fetch failed: {e}")
-            return None
-
-    def _handle_google_drive(self, url: str) -> Optional[Dict]:
-        """Process Google Drive file links"""
-        try:
-            file_id = re.search(r'/file/d/([a-zA-Z0-9_-]+)', url)
-            if not file_id:
-                logger.error(f"Invalid Google Drive URL: {url}")
-                return None
-
-            direct_url = f"https://drive.google.com/uc?export=download&id={file_id.group(1)}"
-            response = self.session.get(direct_url, timeout=self.timeout)
-            response.raise_for_status()
-
-            return {
-                'content': response.text,
-                'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
-            }
-        except Exception as e:
-            logger.error(f"Google Drive processing failed: {e}")
-            return None
+            result = self._fetch_html_content(url)

-                'content_type': 'text/calendar',
-                'timestamp': datetime.now().isoformat()
-            }
+            # Log the result status
+            if result:
+                logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
+            else:
+                logger.error(f"Failed to extract content from {url}")
+
+            return result
         except Exception as e:
-            logger.error(f"
+            logger.error(f"Content fetch failed for {url}: {e}")
             return None

     def _fetch_html_content(self, url: str) -> Optional[Dict]:
         """Standard HTML content processing"""
         try:
+            # Try with a different user agent if it's a social media site
+            if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
+                self.session.headers.update({
+                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
+                })
+
             response = self.session.get(url, timeout=self.timeout)
             response.raise_for_status()
-
+
+            logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
+
+            # Save the raw HTML for debugging if needed
+            with open(f"debug_raw_{int(time.time())}.html", "w", encoding="utf-8") as f:
+                f.write(response.text)
+
             soup = BeautifulSoup(response.text, 'html.parser')

             # Remove unwanted elements
@@ -154,12 +152,42 @@ class URLProcessor:
             elif 'instagram.com' in url or 'twitter.com' in url or 'x.com' in url:
                 for element in soup.select('[role="presentation"], [role="banner"], [role="complementary"]'):
                     element.decompose()
+            elif 'huggingface.co' in url:
+                # Special handling for Hugging Face
+                logger.info("Applying special handling for Hugging Face")
+                # Try to find the main content
+                hf_selectors = ['.prose', '.space-content', '.model-description', '.dataset-description', 'article', '.markdown']
+                for selector in hf_selectors:
+                    elements = soup.select(selector)
+                    if elements:
+                        logger.info(f"Found Hugging Face content with selector: {selector}")
+                        break
+
+            # Extract content using a general approach - try multiple strategies
+            # Strategy 1: Look for semantic HTML5 elements
+            main_content = None
+            for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
+                elements = soup.select(selector)
+                if elements:
+                    main_content = elements[0]
+                    logger.info(f"Found content with selector: {selector}")
+                    break
+
+            # Strategy 2: If no semantic elements, try common class names
+            if not main_content or not main_content.get_text(strip=True):
+                for div in soup.find_all('div'):
+                    class_name = div.get('class', [])
+                    id_name = div.get('id', '')
+                    if any(term in ' '.join(class_name).lower() for term in ['content', 'main', 'body', 'article', 'post']):
+                        main_content = div
+                        logger.info(f"Found content with div class: {class_name}")
+                        break
+                    if any(term in id_name.lower() for term in ['content', 'main', 'body', 'article', 'post']):
+                        main_content = div
+                        logger.info(f"Found content with div id: {id_name}")
+                        break

+            # Strategy 3: Fall back to body
             if not main_content or not main_content.get_text(strip=True):
                 logger.info(f"No main content container found for {url}, using body")
                 main_content = soup.body if soup.body else soup
@@ -167,17 +195,29 @@ class URLProcessor:
             # Extract text with proper spacing
             text_content = main_content.get_text(separator='\n', strip=True)

-            # If content is too short,
+            # Strategy 4: If content is too short, extract all visible text
             if len(text_content) < 100:
-                logger.info(f"Content too short for {url}, using all visible text")
+                logger.info(f"Content too short for {url} ({len(text_content)} chars), using all visible text")
                 visible_text = []
                 for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']):
                     if element.get_text(strip=True):
                         visible_text.append(element.get_text(strip=True))
                 text_content = '\n'.join(visible_text)
+
+            # Strategy 5: Last resort - get all text from the page
+            if len(text_content) < 50:
+                logger.info(f"Still insufficient content for {url} ({len(text_content)} chars), using entire page text")
+                text_content = soup.get_text(separator='\n', strip=True)

             # Clean and structure content
             cleaned_content = self.advanced_text_cleaning(text_content)
+
+            logger.info(f"Final content length: {len(cleaned_content)} chars")
+
+            # If we still have no content, this is a failure
+            if len(cleaned_content) < 20:
+                logger.error(f"Failed to extract meaningful content from {url}")
+                return None

             return {
                 'content': cleaned_content,
@@ -355,7 +395,7 @@ def generate_qr(json_data):
         qr.add_data(json_data)
         qr.make(fit=True)

-        img =
+        img = qr.make_image(fill_color="black", back_color="white")
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
         img.save(temp_file.name)
         return temp_file.name
@@ -373,7 +413,7 @@ def generate_qr(json_data):
         qr.add_data("Error: Data too large for QR code")
         qr.make(fit=True)

-        img =
+        img = qr.make_image(fill_color="black", back_color="white")
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
         img.save(temp_file.name)
         return temp_file.name
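For context, a standalone sketch of the HEAD-then-GET validation pattern this commit introduces in the URL check, assuming the requests and validators packages already used in app.py; the check_url name, the bare Session, and the example URL are illustrative and not part of the commit:

import logging

import requests
import validators

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def check_url(url: str, timeout: int = 10) -> dict:
    """Validate a URL, falling back to GET when the server rejects HEAD."""
    if not validators.url(url):
        return {'is_valid': False, 'message': 'Invalid URL format'}
    session = requests.Session()
    try:
        try:
            # Cheap check first: HEAD avoids downloading the body
            response = session.head(url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.warning(f"HEAD request failed for {url}, trying GET: {e}")
            # stream=True defers the body; close() drops the connection early
            response = session.get(url, timeout=timeout, stream=True)
            response.raise_for_status()
            response.close()
        return {'is_valid': True, 'message': 'URL is valid and accessible'}
    except Exception as e:
        return {'is_valid': False, 'message': f'URL validation failed: {str(e)}'}


if __name__ == "__main__":
    print(check_url("https://huggingface.co"))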