Update app.py
app.py CHANGED
@@ -142,21 +142,51 @@ class URLProcessor:
             # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
-
-            #
-
+
+            # Remove login walls and overlays common on social media sites
+            for element in soup.select('.login-wall, .signup-wall, .overlay, .modal, [role="dialog"], [aria-modal="true"]'):
+                element.decompose()
+
+            # Remove specific elements for known sites
+            if 'facebook.com' in url:
+                for element in soup.select('[data-testid="cookie-policy-manage-dialog"], [role="banner"], [role="complementary"]'):
+                    element.decompose()
+            elif 'instagram.com' in url or 'twitter.com' in url or 'x.com' in url:
+                for element in soup.select('[role="presentation"], [role="banner"], [role="complementary"]'):
+                    element.decompose()
+
+            # Extract content using a general approach
+            # First try to find main content containers
+            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=lambda c: c and any(term in c for term in ['content', 'main', 'body', 'post', 'feed', 'timeline']))
+
+            # If no specific container found, fall back to body
+            if not main_content or not main_content.get_text(strip=True):
+                logger.info(f"No main content container found for {url}, using body")
+                main_content = soup.body if soup.body else soup
+
+            # Extract text with proper spacing
+            text_content = main_content.get_text(separator='\n', strip=True)
+
+            # If content is too short, try a more aggressive approach to get all visible text
+            if len(text_content) < 100:
+                logger.info(f"Content too short for {url}, using all visible text")
+                visible_text = []
+                for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']):
+                    if element.get_text(strip=True):
+                        visible_text.append(element.get_text(strip=True))
+                text_content = '\n'.join(visible_text)

             # Clean and structure content
-            text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)

             return {
                 'content': cleaned_content,
                 'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
+                'timestamp': datetime.now().isoformat(),
+                'url': url  # Add the URL to the returned data for reference
             }
         except Exception as e:
-            logger.error(f"HTML processing failed for {url}: {e}")
             return None

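The new extraction path is easier to evaluate outside the diff. Below is a minimal, self-contained sketch of the same strategy, assuming BeautifulSoup 4: overlay removal, site-specific cleanup, container discovery, then a visible-text fallback. The function name extract_page_text and the OVERLAY_SELECTOR constant are illustrative, not names from app.py, and the sketch omits the logging and advanced_text_cleaning steps that URLProcessor applies.

from bs4 import BeautifulSoup

# Selector for login walls and overlays, taken from the diff above.
OVERLAY_SELECTOR = '.login-wall, .signup-wall, .overlay, .modal, [role="dialog"], [aria-modal="true"]'

def extract_page_text(html: str, url: str) -> str:
    # Illustrative stand-in for the extraction logic added in this commit.
    soup = BeautifulSoup(html, 'html.parser')

    # Drop non-content elements, then login/overlay widgets.
    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
        element.decompose()
    for element in soup.select(OVERLAY_SELECTOR):
        element.decompose()

    # Site-specific chrome, mirroring the diff's per-site branches.
    if 'facebook.com' in url:
        selector = '[data-testid="cookie-policy-manage-dialog"], [role="banner"], [role="complementary"]'
    elif any(host in url for host in ('instagram.com', 'twitter.com', 'x.com')):
        selector = '[role="presentation"], [role="banner"], [role="complementary"]'
    else:
        selector = None
    if selector:
        for element in soup.select(selector):
            element.decompose()

    # Prefer semantic containers, then divs whose class hints at content.
    main_content = (
        soup.find('main')
        or soup.find('article')
        or soup.find('div', class_=lambda c: c and any(
            term in c for term in ['content', 'main', 'body', 'post', 'feed', 'timeline']))
    )
    if not main_content or not main_content.get_text(strip=True):
        main_content = soup.body if soup.body else soup

    text_content = main_content.get_text(separator='\n', strip=True)

    # Fallback for near-empty results: harvest text-bearing tags directly.
    if len(text_content) < 100:
        visible_text = [el.get_text(strip=True)
                        for el in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4',
                                                 'h5', 'h6', 'li', 'span', 'div'])
                        if el.get_text(strip=True)]
        text_content = '\n'.join(visible_text)
    return text_content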
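A quick smoke test of the sketch, with made-up HTML, shows the overlay removal and the short-content fallback working together:

html = """
<html><body>
  <div class="login-wall">Sign in to continue</div>
  <main><h1>Post title</h1><p>Hello from the feed.</p></main>
</body></html>
"""
print(extract_page_text(html, 'https://example.com/post'))
# Post title
# Hello from the feed.

One caveat worth flagging for a follow-up: because the fallback also walks span and div, nested containers on real pages can emit the same text more than once; restricting the tag list to leaf-like tags or deduplicating repeated lines would tighten the output.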