acecalisto3 committed
Commit df3f48b · verified · 1 Parent(s): de72fda

Update app.py

Files changed (1):
  app.py +36 -6
app.py CHANGED
@@ -142,21 +142,51 @@ class URLProcessor:
             # Remove unwanted elements
             for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                 element.decompose()
-
-            # Extract main content
-            main_content = soup.find('main') or soup.find('article') or soup.body
+
+            # Remove login walls and overlays common on social media sites
+            for element in soup.select('.login-wall, .signup-wall, .overlay, .modal, [role="dialog"], [aria-modal="true"]'):
+                element.decompose()
+
+            # Remove specific elements for known sites
+            if 'facebook.com' in url:
+                for element in soup.select('[data-testid="cookie-policy-manage-dialog"], [role="banner"], [role="complementary"]'):
+                    element.decompose()
+            elif 'instagram.com' in url or 'twitter.com' in url or 'x.com' in url:
+                for element in soup.select('[role="presentation"], [role="banner"], [role="complementary"]'):
+                    element.decompose()
+
+            # Extract content using a general approach
+            # First try to find main content containers
+            main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=lambda c: c and any(term in c for term in ['content', 'main', 'body', 'post', 'feed', 'timeline']))
+
+            # If no specific container found, fall back to body
+            if not main_content or not main_content.get_text(strip=True):
+                logger.info(f"No main content container found for {url}, using body")
+                main_content = soup.body if soup.body else soup
+
+            # Extract text with proper spacing
+            text_content = main_content.get_text(separator='\n', strip=True)
+
+            # If content is too short, try a more aggressive approach to get all visible text
+            if len(text_content) < 100:
+                logger.info(f"Content too short for {url}, using all visible text")
+                visible_text = []
+                for element in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div']):
+                    if element.get_text(strip=True):
+                        visible_text.append(element.get_text(strip=True))
+                text_content = '\n'.join(visible_text)
 
             # Clean and structure content
-            text_content = main_content.get_text(separator='\n', strip=True)
             cleaned_content = self.advanced_text_cleaning(text_content)
 
             return {
                 'content': cleaned_content,
                 'content_type': response.headers.get('Content-Type', ''),
-                'timestamp': datetime.now().isoformat()
+                'timestamp': datetime.now().isoformat(),
+                'url': url  # Add the URL to the returned data for reference
             }
         except Exception as e:
-            logger.error(f"HTML processing failed: {e}")
+            logger.error(f"HTML processing failed for {url}: {e}")
             return None
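
For readers who want the new logic outside the diff context, here is a minimal standalone sketch of the extraction strategy this commit introduces. It is an illustration under stated assumptions, not the code in app.py: the URLProcessor context is stubbed out (the logger calls and self.advanced_text_cleaning step are omitted), the site-specific Facebook/Instagram/X selectors are skipped for brevity, and extract_page_text and MIN_CONTENT_LEN are names invented for this example.

    from datetime import datetime
    from typing import Optional

    import requests
    from bs4 import BeautifulSoup

    MIN_CONTENT_LEN = 100  # same fallback threshold the commit uses

    def extract_page_text(url: str, timeout: float = 10.0) -> Optional[dict]:
        """Fetch a page and apply the commit's layered extraction strategy."""
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Step 1: drop non-content tags, then login walls and overlays.
            for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
                element.decompose()
            for element in soup.select('.login-wall, .signup-wall, .overlay, .modal, [role="dialog"], [aria-modal="true"]'):
                element.decompose()

            # Step 2: prefer a semantic container; otherwise fall back to <body>.
            main_content = (
                soup.find('main')
                or soup.find('article')
                or soup.find('div', class_=lambda c: c and any(
                    term in c for term in ['content', 'main', 'body', 'post', 'feed', 'timeline']))
            )
            if not main_content or not main_content.get_text(strip=True):
                main_content = soup.body if soup.body else soup

            text_content = main_content.get_text(separator='\n', strip=True)

            # Step 3: if the result looks truncated, harvest all visible text.
            if len(text_content) < MIN_CONTENT_LEN:
                visible = [el.get_text(strip=True)
                           for el in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'span', 'div'])
                           if el.get_text(strip=True)]
                text_content = '\n'.join(visible)

            return {
                'content': text_content,
                'content_type': response.headers.get('Content-Type', ''),
                'timestamp': datetime.now().isoformat(),
                'url': url,  # kept for reference, as in the commit
            }
        except Exception:
            return None

The design trades precision for recall in three layers: prefer a semantic container, fall back to <body>, and finally harvest all visible text when the result looks too short. That ordering suits social media pages, which often render posts in generic divs behind overlays; in app.py the result is then passed through advanced_text_cleaning to strip the noise the wider nets pick up.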