Update app.py

app.py CHANGED

@@ -89,13 +89,21 @@ async def get_links(session, url, base_url):
         if response.status == 200:
             text = await response.text()
             soup = BeautifulSoup(text, 'html.parser')
-            links = soup.find_all('a', href=True)
             valid_links = []
-
-
-
-
-
+
+            # Look for the main content area
+            main_content = soup.find('div', class_='toc')
+
+            if main_content:
+                # Find all link containers
+                link_containers = main_content.find_all('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
+
+                for container in link_containers:
+                    for link in container.find_all('a', href=True):
+                        full_url = urljoin(base_url, link['href'])
+                        if full_url.startswith(base_url) and full_url != url:
+                            valid_links.append(full_url)
+
             return valid_links
         else:
             logger.error(f"Error fetching links from {url}: HTTP {response.status}")
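
The change above narrows link extraction: instead of collecting every anchor on the page, get_links now only looks inside the div with class 'toc' and, within it, inside div containers whose id starts with "{'type': 'links-show'". The snippet below is a minimal offline sketch of that selection logic, not part of the commit; the sample HTML fragment and the example.org URLs are invented for illustration, and only the BeautifulSoup/urljoin calls mirror what the updated function does.

from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Hypothetical fragment imitating the structure the updated get_links targets.
html = """
<div class="toc">
  <div id="{'type': 'links-show', 'page': 1}">
    <a href="/docs/page1">Page 1</a>
    <a href="https://other.example.net/external">External</a>
  </div>
  <div id="sidebar"><a href="/ignored">Not inside a links-show container</a></div>
</div>
"""

base_url = "https://example.org"    # assumed crawl root
url = "https://example.org/docs"    # assumed page currently being parsed

soup = BeautifulSoup(html, 'html.parser')
valid_links = []

main_content = soup.find('div', class_='toc')
if main_content:
    # id= accepts a callable, so containers are matched by their id prefix.
    link_containers = main_content.find_all('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
    for container in link_containers:
        for link in container.find_all('a', href=True):
            full_url = urljoin(base_url, link['href'])
            # Keep only same-site links that are not the page itself.
            if full_url.startswith(base_url) and full_url != url:
                valid_links.append(full_url)

print(valid_links)  # ['https://example.org/docs/page1']
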
@@ -116,14 +124,15 @@ async def crawl_pages(base_url, max_depth):
             continue
 
         visited.add(current_url)
-
+        start_time = time.time()
 
         content = await get_page_content(session, current_url)
         all_pages.append((current_url, content))
+        logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
 
         if depth < max_depth:
-
-            for link in
+            links = await get_links(session, current_url, base_url)
+            for link in links:
                 if link not in visited and link not in [url for url, _ in to_visit]:
                     to_visit.append((link, depth + 1))
 
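
The crawl_pages changes slot into a breadth-first crawl loop: a to_visit queue of (url, depth) pairs, a visited set, a per-page timer around the fetch, and link discovery via get_links only while depth is below max_depth. The sketch below reconstructs that control flow under stated assumptions: the surrounding while loop and session handling are not shown in this diff, and the real get_page_content and get_links helpers are replaced with trivial stand-ins. It illustrates the loop shape rather than the app's actual implementation.

import asyncio
import logging
import time

import aiohttp

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

async def get_page_content(session, url):
    # Stand-in for the app's helper: fetch and return the raw page body.
    async with session.get(url) as response:
        return await response.text()

async def get_links(session, url, base_url):
    # Stand-in for the app's helper: real link extraction is omitted here.
    return []

async def crawl_pages(base_url, max_depth):
    visited = set()
    to_visit = [(base_url, 0)]
    all_pages = []

    async with aiohttp.ClientSession() as session:
        while to_visit:
            current_url, depth = to_visit.pop(0)
            if current_url in visited:
                continue

            visited.add(current_url)
            start_time = time.time()

            content = await get_page_content(session, current_url)
            all_pages.append((current_url, content))
            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

            if depth < max_depth:
                links = await get_links(session, current_url, base_url)
                for link in links:
                    if link not in visited and link not in [url for url, _ in to_visit]:
                        to_visit.append((link, depth + 1))

    return all_pages

# Example run (requires network access and the aiohttp package):
# asyncio.run(crawl_pages("https://example.org", max_depth=1))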