Update app.py
app.py CHANGED
@@ -142,21 +142,51 @@ class URLProcessor:
             # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
-
-            #
-
+
+            # Remove login walls and overlays common on social media sites
+            for element in soup.select('.login-wall, .signup-wall, .overlay, .modal, [role="dialog"], [aria-modal="true"]'):
+                element.decompose()
+
+            # Remove specific elements for known sites
+            if 'facebook.com' in url:
+                for element in soup.select('[data-testid="cookie-policy-manage-dialog"], [role="banner"], [role="complementary"]'):
+                    element.decompose()
+            elif 'instagram.com' in url or 'twitter.com' in url or 'x.com' in url:
+                for element in soup.select('[role="presentation"], [role="banner"], [role="complementary"]'):
+                    element.decompose()
+
+            # Extract content using a general approach
+            # First try to find main content containers
+            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=lambda c: c and any(term in c for term in ['content', 'main', 'body', 'post', 'feed', 'timeline']))
+
+            # If no specific container found, fall back to body
+            if not main_content or not main_content.get_text(strip=True):
+                logger.info(f"No main content container found for {url}, using body")
+                main_content = soup.body if soup.body else soup
+
+            # Extract text with proper spacing
+            text_content = main_content.get_text(separator='\n', strip=True)
+
+            # If content is too short, try a more aggressive approach to get all visible text
+            if len(text_content) < 100:
+                logger.info(f"Content too short for {url}, using all visible text")
+                visible_text = []
+                for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']):
+                    if element.get_text(strip=True):
+                        visible_text.append(element.get_text(strip=True))
+                text_content = '\n'.join(visible_text)

             # Clean and structure content
-            text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)

             return {
                 'content': cleaned_content,
                 'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
+                'timestamp': datetime.now().isoformat(),
+                'url': url  # Add the URL to the returned data for reference
             }
         except Exception as e:
-            logger.error(f"HTML processing failed for {url}: {e}")
             return None

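The new extraction path is easier to evaluate outside the diff. Below is a minimal, self-contained sketch of the same strategy, assuming BeautifulSoup 4: overlay removal, site-specific cleanup, container discovery, then a visible-text fallback. The function name extract_page_text and the OVERLAY_SELECTOR constant are illustrative, not names from app.py, and the sketch omits the logging and advanced_text_cleaning steps that URLProcessor applies.

from bs4 import BeautifulSoup

# Selector for login walls and overlays, taken from the diff above.
OVERLAY_SELECTOR = '.login-wall, .signup-wall, .overlay, .modal, [role="dialog"], [aria-modal="true"]'

def extract_page_text(html: str, url: str) -> str:
    # Illustrative stand-in for the extraction logic added in this commit.
    soup = BeautifulSoup(html, 'html.parser')

    # Drop non-content elements, then login/overlay widgets.
    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
        element.decompose()
    for element in soup.select(OVERLAY_SELECTOR):
        element.decompose()

    # Site-specific chrome, mirroring the diff's per-site branches.
    if 'facebook.com' in url:
        selector = '[data-testid="cookie-policy-manage-dialog"], [role="banner"], [role="complementary"]'
    elif any(host in url for host in ('instagram.com', 'twitter.com', 'x.com')):
        selector = '[role="presentation"], [role="banner"], [role="complementary"]'
    else:
        selector = None
    if selector:
        for element in soup.select(selector):
            element.decompose()

    # Prefer semantic containers, then divs whose class hints at content.
    main_content = (
        soup.find('main')
        or soup.find('article')
        or soup.find('div', class_=lambda c: c and any(
            term in c for term in ['content', 'main', 'body', 'post', 'feed', 'timeline']))
    )
    if not main_content or not main_content.get_text(strip=True):
        main_content = soup.body if soup.body else soup

    text_content = main_content.get_text(separator='\n', strip=True)

    # Fallback for near-empty results: harvest text-bearing tags directly.
    if len(text_content) < 100:
        visible_text = [el.get_text(strip=True)
                        for el in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4',
                                                 'h5', 'h6', 'li', 'span', 'div'])
                        if el.get_text(strip=True)]
        text_content = '\n'.join(visible_text)
    return text_content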
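A quick smoke test of the sketch, with made-up HTML, shows the overlay removal and the short-content fallback working together:

html = """
<html><body>
  <div class="login-wall">Sign in to continue</div>
  <main><h1>Post title</h1><p>Hello from the feed.</p></main>
</body></html>
"""
print(extract_page_text(html, 'https://example.com/post'))
# Post title
# Hello from the feed.

One caveat worth flagging for a follow-up: because the fallback also walks span and div, nested containers on real pages can emit the same text more than once; restricting the tag list to leaf-like tags or deduplicating repeated lines would tighten the output.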