bluenevus committed
Commit ead5062 · verified · 1 parent: 40c0a08

Update app.py

Files changed (1)
app.py +13 -23
app.py CHANGED
@@ -57,21 +57,14 @@ async def get_page_content(session, url):
                 content = []
 
                 # Look for the main content area
-                main_content = soup.find('div', class_='toc')
+                main_content = soup.find('div', id='react-entry-point')
 
                 if main_content:
-                    # Extract section titles and links
-                    for section in main_content.find_all('div', class_='toc--section'):
-                        title = section.find('h2', class_='toc-title-border')
-                        if title:
-                            content.append(f"H2: {title.text.strip()}")
-
-                        links_store = section.find('div', id=lambda x: x and x.startswith("{'type': 'links-store'"))
-                        if links_store:
-                            links = links_store.find_next('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
-                            if links:
-                                for link in links.find_all('a'):
-                                    content.append(f"LINK: {link.text.strip()} - {urljoin(url, link['href'])}")
+                    # Extract all text content
+                    for tag in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'code']):
+                        text = clean_text(tag.get_text(strip=True))
+                        if text:
+                            content.append(f"{tag.name.upper()}: {text}")
 
                 logger.info(f"Found {len(content)} content items for {url}")
                 return content
@@ -81,7 +74,7 @@ async def get_page_content(session, url):
     except Exception as e:
         logger.error(f"Error processing {url}: {str(e)}")
         return [f"Error processing {url}: {str(e)}"]
-
+
 async def get_links(session, url, base_url):
     try:
         async with rate_limiter:
@@ -92,17 +85,14 @@ async def get_links(session, url, base_url):
                 valid_links = []
 
                 # Look for the main content area
-                main_content = soup.find('div', class_='toc')
+                main_content = soup.find('div', id='react-entry-point')
 
                 if main_content:
-                    # Find all link containers
-                    link_containers = main_content.find_all('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
-
-                    for container in link_containers:
-                        for link in container.find_all('a', href=True):
-                            full_url = urljoin(base_url, link['href'])
-                            if full_url.startswith(base_url) and full_url != url:
-                                valid_links.append(full_url)
+                    for link in main_content.find_all('a', href=True):
+                        href = link['href']
+                        full_url = urljoin(base_url, href)
+                        if full_url.startswith(base_url) and full_url != url:
+                            valid_links.append(full_url)
 
                 return valid_links
             else:
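
Note: the rewritten extraction path calls a clean_text helper that is not part of this diff; it presumably already exists elsewhere in app.py. A minimal sketch of what such a helper might look like, assuming it only normalizes whitespace (the actual definition may differ):

import re

def clean_text(text):
    # Hypothetical stand-in for the clean_text defined elsewhere in app.py:
    # collapse runs of whitespace into single spaces and trim both ends.
    return re.sub(r'\s+', ' ', text).strip()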
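
For context, a div with id 'react-entry-point' is the mount node Dash uses for its rendered front end, which fits the move away from the old 'toc' class selectors toward generic tag-based scraping. A self-contained check of the new selectors against a toy page (the HTML and base_url below are illustrative, not taken from the target site):

from urllib.parse import urljoin
from bs4 import BeautifulSoup

html = """
<div id="react-entry-point">
  <h1>Docs</h1>
  <p>Getting started guide.</p>
  <a href="/install">Install</a>
  <a href="https://elsewhere.example/off-site">Off-site</a>
</div>
"""
soup = BeautifulSoup(html, 'html.parser')
main_content = soup.find('div', id='react-entry-point')

# Mirrors the new get_page_content loop (clean_text omitted for brevity):
for tag in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'code']):
    text = tag.get_text(strip=True)
    if text:
        print(f"{tag.name.upper()}: {text}")  # -> "H1: Docs", "P: Getting started guide."

# Mirrors the new get_links filter: resolve relative hrefs, keep same-site URLs only.
base_url = 'https://docs.example.com'
for link in main_content.find_all('a', href=True):
    full_url = urljoin(base_url, link['href'])
    if full_url.startswith(base_url):
        print(f"LINK: {full_url}")  # only the /install link survives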