bluenevus committed
Commit 40c0a08 · verified · 1 Parent(s): 156e042

Update app.py

Files changed (1)
  1. app.py +18 -9
app.py CHANGED
@@ -89,13 +89,21 @@ async def get_links(session, url, base_url):
         if response.status == 200:
             text = await response.text()
             soup = BeautifulSoup(text, 'html.parser')
-            links = soup.find_all('a', href=True)
             valid_links = []
-            for link in links:
-                href = link['href']
-                full_url = urljoin(base_url, href)
-                if full_url.startswith(base_url) and full_url != url:
-                    valid_links.append(full_url)
+
+            # Look for the main content area
+            main_content = soup.find('div', class_='toc')
+
+            if main_content:
+                # Find all link containers
+                link_containers = main_content.find_all('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
+
+                for container in link_containers:
+                    for link in container.find_all('a', href=True):
+                        full_url = urljoin(base_url, link['href'])
+                        if full_url.startswith(base_url) and full_url != url:
+                            valid_links.append(full_url)
+
             return valid_links
         else:
             logger.error(f"Error fetching links from {url}: HTTP {response.status}")
@@ -116,14 +124,15 @@ async def crawl_pages(base_url, max_depth):
                 continue
 
             visited.add(current_url)
-            logger.info(f"Crawling: {current_url} at depth {depth}")
+            start_time = time.time()
 
             content = await get_page_content(session, current_url)
             all_pages.append((current_url, content))
+            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
 
             if depth < max_depth:
-                new_links = [link.split(' - ')[1] for link in content if link.startswith('LINK:')]
-                for link in new_links:
+                links = await get_links(session, current_url, base_url)
+                for link in links:
                     if link not in visited and link not in [url for url, _ in to_visit]:
                         to_visit.append((link, depth + 1))
 
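
A quick way to sanity-check the new scoped link extraction outside the crawler is to run the same BeautifulSoup traversal over a small HTML fragment. The snippet below is a hypothetical check, not part of this commit: the markup, base_url, and url values are invented, while the 'toc' class and the "{'type': 'links-show'" id prefix mirror the updated get_links().

# Hypothetical standalone check of the new selector logic (not part of the commit).
from urllib.parse import urljoin
from bs4 import BeautifulSoup

base_url = "https://example.com/docs"    # assumed site root used by the crawler
url = "https://example.com/docs/index"   # assumed page currently being parsed

html = """
<div class="toc">
  <div id="{'type': 'links-show', 'index': 0}">
    <a href="/docs/page-1">Page 1</a>
    <a href="https://other.site/ignored">External</a>
  </div>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')
valid_links = []

# Same traversal as the updated get_links(): scope the search to the 'toc'
# container, then to divs whose id starts with the "{'type': 'links-show'" marker.
main_content = soup.find('div', class_='toc')
if main_content:
    link_containers = main_content.find_all('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
    for container in link_containers:
        for link in container.find_all('a', href=True):
            full_url = urljoin(base_url, link['href'])
            if full_url.startswith(base_url) and full_url != url:
                valid_links.append(full_url)

print(valid_links)  # expected: ['https://example.com/docs/page-1']

Scoping the search to the 'toc' container (instead of the old soup.find_all('a', href=True)) presumably keeps the crawler from queueing navigation and footer links that live outside the table of contents.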