Update app.py
app.py CHANGED
@@ -55,13 +55,13 @@ async def get_page_content(session, url):
             text = await response.text()
             soup = BeautifulSoup(text, 'html.parser')
             content = []
-            main_content = soup.find('
+            main_content = soup.find('main') or soup.find('div', class_='content') or soup
             if main_content:
-                for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+                for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'pre', 'code']:
                     for element in main_content.find_all(tag):
                         text = clean_text(element.get_text(strip=True))
                         if text:
-                            content.append(text)
+                            content.append(f"{tag.upper()}: {text}")
             logger.info(f"Found {len(content)} content items for {url}")
             return content
         else:
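Read as a whole, this hunk swaps a single hard-coded container lookup for a fallback chain and tags each extracted string with its source element. A self-contained sketch of how the updated function plausibly reads; the request/status scaffolding, clean_text, and the logger setup are assumptions, since none of them appear in the hunk:

import logging
import re

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

def clean_text(text):
    # Assumed helper (not shown in the diff): collapse runs of whitespace.
    return re.sub(r'\s+', ' ', text).strip()

async def get_page_content(session, url):
    # session is assumed to be an aiohttp.ClientSession; the lines outside
    # the hunk are inferred from context, not taken from the diff.
    async with session.get(url) as response:
        if response.status == 200:
            text = await response.text()
            soup = BeautifulSoup(text, 'html.parser')
            content = []
            # Fallback chain: prefer <main>, then <div class="content">,
            # then the whole document.
            main_content = soup.find('main') or soup.find('div', class_='content') or soup
            if main_content:
                # 'pre' and 'code' are newly included so code samples
                # survive extraction.
                for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'pre', 'code']:
                    for element in main_content.find_all(tag):
                        text = clean_text(element.get_text(strip=True))
                        if text:
                            # Tag prefix keeps structure visible in the flat list.
                            content.append(f"{tag.upper()}: {text}")
            logger.info(f"Found {len(content)} content items for {url}")
            return content
        else:
            logger.warning(f"Got status {response.status} for {url}")
            return []

The "or soup" fallback means extraction never dead-ends on pages without a <main> or .content wrapper, at the cost of also picking up navigation and footer text on those pages.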
@@ -82,10 +82,7 @@ async def get_links(session, url, base_url):
         valid_links = []
         for link in links:
             href = link['href']
-            full_url = urljoin(
-            # Check if the link is relative and doesn't start with '/'
-            if not href.startswith('/') and not href.startswith('http'):
-                full_url = f"{base_url}/{href}"
+            full_url = urljoin(base_url, href)
             if full_url.startswith(base_url) and full_url != url:
                 valid_links.append(full_url)
         return valid_links
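The second hunk replaces three lines of hand-rolled relative-link handling with a single urllib.parse.urljoin call, which resolves document-relative, root-relative, and absolute hrefs per RFC 3986. A quick sketch of the cases the removed branch tried to cover by hand (the URLs are illustrative, not from the app):

from urllib.parse import urljoin

base_url = "https://example.com/docs"  # hypothetical crawl root

# Document-relative href: resolved against the base's directory.
print(urljoin(base_url, "intro.html"))           # https://example.com/intro.html
# Root-relative href: the old code's startswith('/') special case.
print(urljoin(base_url, "/api/reference"))       # https://example.com/api/reference
# Absolute href: passed through unchanged, then filtered out by the
# startswith(base_url) check in the function.
print(urljoin(base_url, "https://other.site/"))  # https://other.site/

One caveat worth noting: joining against base_url rather than the page's own url means document-relative links resolve from the crawl root, not from the page that contained them; urljoin(url, href) would match browser behavior if that matters for this crawler.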