bluenevus committed on
Commit
a8a1bcb
·
verified ·
1 Parent(s): 3edbcab

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -7
app.py CHANGED
@@ -55,13 +55,13 @@ async def get_page_content(session, url):
55
  text = await response.text()
56
  soup = BeautifulSoup(text, 'html.parser')
57
  content = []
58
- main_content = soup.find('article') or soup.find('main') or soup
59
  if main_content:
60
- for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
61
  for element in main_content.find_all(tag):
62
  text = clean_text(element.get_text(strip=True))
63
  if text:
64
- content.append(text)
65
  logger.info(f"Found {len(content)} content items for {url}")
66
  return content
67
  else:
@@ -82,10 +82,7 @@ async def get_links(session, url, base_url):
82
  valid_links = []
83
  for link in links:
84
  href = link['href']
85
- full_url = urljoin(url, href)
86
- # Check if the link is relative and doesn't start with '/'
87
- if not href.startswith('/') and not href.startswith('http'):
88
- full_url = f"{base_url}/{href}"
89
  if full_url.startswith(base_url) and full_url != url:
90
  valid_links.append(full_url)
91
  return valid_links
 
55
  text = await response.text()
56
  soup = BeautifulSoup(text, 'html.parser')
57
  content = []
58
+ main_content = soup.find('main') or soup.find('div', class_='content') or soup
59
  if main_content:
60
+ for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'pre', 'code']:
61
  for element in main_content.find_all(tag):
62
  text = clean_text(element.get_text(strip=True))
63
  if text:
64
+ content.append(f"{tag.upper()}: {text}")
65
  logger.info(f"Found {len(content)} content items for {url}")
66
  return content
67
  else:
 
82
  valid_links = []
83
  for link in links:
84
  href = link['href']
85
+ full_url = urljoin(base_url, href)
 
 
 
86
  if full_url.startswith(base_url) and full_url != url:
87
  valid_links.append(full_url)
88
  return valid_links