Update app.py

app.py CHANGED
@@ -119,9 +119,51 @@ class URLProcessor:
         if 'calendar.google.com' in url and 'ical' in url:
             return self._handle_google_calendar(url)
 
-        #
+        # Try standard HTML processing first
         result = self._fetch_html_content(url)
 
+        # If standard processing failed or returned minimal content, try with Selenium
+        if not result or len(result.get('content', '')) < 100:
+            logger.info(f"Standard processing failed or returned minimal content for {url}, trying Selenium")
+            selenium_html = self._fetch_with_selenium(url)
+
+            if selenium_html:
+                # Process the Selenium HTML
+                soup = BeautifulSoup(selenium_html, 'html.parser')
+
+                # Remove unwanted elements
+                for element in soup(['script', 'style', 'nav', 'footer', 'header', 'meta', 'link']):
+                    element.decompose()
+
+                # Apply the same content extraction strategies as in _fetch_html_content
+                # Strategy 1: Look for semantic HTML5 elements
+                main_content = None
+                for selector in ['main', 'article', 'section', '.content', '.main', '.body', '.post', '.entry', '.page']:
+                    elements = soup.select(selector)
+                    if elements:
+                        main_content = elements[0]
+                        logger.info(f"Found content with selector: {selector}")
+                        break
+
+                # If no main content found, use body
+                if not main_content or not main_content.get_text(strip=True):
+                    main_content = soup.body if soup.body else soup
+
+                # Extract text
+                text_content = main_content.get_text(separator='\n', strip=True)
+
+                # Clean content
+                cleaned_content = self.advanced_text_cleaning(text_content)
+
+                if len(cleaned_content) >= 20:
+                    result = {
+                        'content': cleaned_content,
+                        'content_type': 'text/html',
+                        'timestamp': datetime.now().isoformat(),
+                        'url': url,
+                        'source': 'selenium'  # Mark that this came from Selenium
+                    }
+
         # Log the result status
         if result:
             logger.info(f"Successfully extracted content from {url} ({len(result.get('content', ''))} chars)")
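Review note: the hunk above wires a two-tier fetch: the cheap static path runs first, and the headless-browser path is paid only when the extracted text is under 100 characters. A minimal standalone sketch of that pattern, where fetch_static and fetch_rendered are hypothetical stand-ins for _fetch_html_content and _fetch_with_selenium (not names from the app):

from typing import Callable, Optional

MIN_CONTENT_CHARS = 100  # same threshold the hunk uses

def fetch_with_fallback(url: str,
                        fetch_static: Callable[[str], Optional[dict]],
                        fetch_rendered: Callable[[str], Optional[str]]) -> Optional[dict]:
    # Cheap path first: plain HTTP fetch plus HTML parsing.
    result = fetch_static(url)
    if result and len(result.get('content', '')) >= MIN_CONTENT_CHARS:
        return result
    # Expensive path: a rendered-DOM fetch (e.g. a headless browser),
    # only when the static pass produced little or nothing.
    html = fetch_rendered(url)
    if html is None:
        return result  # keep whatever the static pass managed
    return {'content': html, 'source': 'rendered'}
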
@@ -138,18 +180,42 @@ class URLProcessor:
         try:
             # Try with a different user agent if it's a social media site
             if any(domain in url for domain in ['facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'huggingface.co']):
+                # Use a more realistic browser user agent instead of random one
                 self.session.headers.update({
-                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
+                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                     # Add cookie consent headers to bypass some login walls
-                    'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080'
+                    'Cookie': 'c_user=0; xs=0; datr=0; locale=en_US; wd=1920x1080; consent_accepted=true; cookie_consent=accepted',
+                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+                    'sec-ch-ua-mobile': '?0',
+                    'sec-ch-ua-platform': '"macOS"',
+                    'Sec-Fetch-Dest': 'document',
+                    'Sec-Fetch-Mode': 'navigate',
+                    'Sec-Fetch-Site': 'none',
+                    'Sec-Fetch-User': '?1',
+                    'Upgrade-Insecure-Requests': '1'
                 })
             # For Facebook, try to access the mobile version which often has fewer restrictions
             if 'facebook.com' in url and 'm.facebook.com' not in url:
                 url = url.replace('www.facebook.com', 'm.facebook.com')
                 logger.info(f"Switched to mobile Facebook URL: {url}")
 
-            response = self.session.get(url, timeout=self.timeout)
-            response.raise_for_status()
+            # Add a delay to simulate human browsing
+            time.sleep(1)
+
+            # Try to get the page with multiple attempts
+            max_attempts = 3
+            for attempt in range(max_attempts):
+                try:
+                    response = self.session.get(url, timeout=self.timeout)
+                    response.raise_for_status()
+                    break
+                except (requests.exceptions.RequestException, Exception) as e:
+                    if attempt < max_attempts - 1:
+                        logger.warning(f"Attempt {attempt+1} failed for {url}: {e}. Retrying...")
+                        time.sleep(2)  # Wait longer between retries
+                    else:
+                        raise
 
             logger.info(f"Response status: {response.status_code}, Content-Type: {response.headers.get('Content-Type')}")
 
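Review note: in the retry loop above, except (requests.exceptions.RequestException, Exception) is effectively just except Exception, since RequestException is a subclass; and the fixed time.sleep(2) is a flat rather than growing back-off. If the manual loop becomes a burden, requests can do the same thing declaratively via urllib3's Retry. A sketch under the assumption of urllib3 >= 1.26 (older versions spell allowed_methods as method_whitelist); the URL and timeout are placeholders:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
retry = Retry(
    total=3,                                     # same attempt budget as the loop above
    backoff_factor=2,                            # exponential back-off between attempts
    status_forcelist=[429, 500, 502, 503, 504],  # also retry on these HTTP statuses
    allowed_methods=["GET"],
)
session.mount("https://", HTTPAdapter(max_retries=retry))
session.mount("http://", HTTPAdapter(max_retries=retry))

response = session.get("https://example.com", timeout=10)
response.raise_for_status()
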
@@ -159,6 +225,11 @@ class URLProcessor:
                 f.write(response.text)
             logger.info(f"Saved raw HTML to {debug_path}")
 
+            # Check if we got a valid response with content
+            if not response.text or len(response.text) < 100:
+                logger.error(f"Empty or very short response from {url}")
+                return None
+
             soup = BeautifulSoup(response.text, 'html.parser')
 
             # Remove unwanted elements
@@ -628,4 +699,66 @@ def main():
     )
 
 if __name__ == "__main__":
-    main()
+    main()
+
+
+def _fetch_with_selenium(self, url: str) -> Optional[str]:
+    """Use Selenium as a fallback for difficult sites"""
+    try:
+        from selenium import webdriver
+        from selenium.webdriver.chrome.options import Options
+        from selenium.webdriver.common.by import By
+        from selenium.webdriver.support.ui import WebDriverWait
+        from selenium.webdriver.support import expected_conditions as EC
+        from selenium.common.exceptions import TimeoutException
+        import time
+
+        logger.info(f"Attempting to fetch {url} with Selenium")
+
+        # Set up Chrome options
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        chrome_options.add_argument("--disable-gpu")
+        chrome_options.add_argument("--window-size=1920,1080")
+        chrome_options.add_argument(f"user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+
+        # Initialize the driver
+        driver = webdriver.Chrome(options=chrome_options)
+
+        try:
+            # Navigate to the URL
+            driver.get(url)
+
+            # Wait for the page to load
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "body"))
+            )
+
+            # Simulate pressing ESC key to dismiss overlays
+            from selenium.webdriver.common.keys import Keys
+            webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
+
+            # Wait a bit for any animations to complete
+            time.sleep(2)
+
+            # Get the page source
+            page_source = driver.page_source
+
+            # Save the Selenium HTML for debugging
+            debug_path = f"/Users/a2014/urld/debug_selenium_{int(time.time())}.html"
+            with open(debug_path, "w", encoding="utf-8") as f:
+                f.write(page_source)
+            logger.info(f"Saved Selenium HTML to {debug_path}")
+
+            return page_source
+        finally:
+            driver.quit()
+
+    except ImportError:
+        logger.error("Selenium is not installed. Cannot use browser automation.")
+        return None
+    except Exception as e:
+        logger.error(f"Selenium processing failed for {url}: {e}")
+        return None
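Review note: as committed, _fetch_with_selenium is appended after the if __name__ == "__main__": guard at module level, yet the first hunk calls it as self._fetch_with_selenium(url); that call will raise AttributeError until the def moves inside URLProcessor. The hardcoded debug path /Users/a2014/urld/ is also a local macOS path that will not exist on a Space, and the broad except Exception will silently turn that write failure into a None return. A trimmed sketch of the placement that would let the first hunk's call resolve, with the body reduced to the essential calls (not the full committed method):

from typing import Optional

class URLProcessor:
    def _fetch_with_selenium(self, url: str) -> Optional[str]:
        """Fallback fetch via a headless browser; returns page HTML or None."""
        try:
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
        except ImportError:
            return None  # Selenium not installed
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        driver = webdriver.Chrome(options=options)
        try:
            driver.get(url)
            return driver.page_source
        finally:
            driver.quit()  # always release the browser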