Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -35,13 +35,18 @@ def get_page_content(url):
|
|
35 |
logger.error(f"Error processing {url}: {str(e)}")
|
36 |
return [f"Error processing {url}: {str(e)}"]
|
37 |
|
38 |
-
def get_links(url):
|
39 |
try:
|
40 |
response = requests.get(url, timeout=10)
|
41 |
response.raise_for_status()
|
42 |
soup = BeautifulSoup(response.text, 'html.parser')
|
43 |
links = soup.find_all('a', href=True)
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
45 |
except Exception as e:
|
46 |
logger.error(f"Error getting links from {url}: {str(e)}")
|
47 |
return []
|
@@ -59,10 +64,13 @@ def crawl_pages(base_url, max_depth):
|
|
59 |
visited.add(current_url)
|
60 |
content = get_page_content(current_url)
|
61 |
all_pages.append((current_url, content))
|
|
|
62 |
|
63 |
if depth < max_depth:
|
64 |
-
links = get_links(current_url)
|
65 |
-
|
|
|
|
|
66 |
|
67 |
return all_pages
|
68 |
|
|
|
35 |
logger.error(f"Error processing {url}: {str(e)}")
|
36 |
return [f"Error processing {url}: {str(e)}"]
|
37 |
|
38 |
+
def get_links(url, base_url):
    """Fetch ``url`` and return the links on it that stay under ``base_url``.

    Every ``<a href=...>`` on the page is resolved against ``url`` and has
    its fragment stripped, so in-page anchors (``#section``) are not
    mistaken for new pages. A link is kept when it:

    * starts with ``base_url``,
    * is on the same host as ``base_url`` (a bare prefix test would also
      accept e.g. ``https://example.com.evil.org`` for
      ``https://example.com``), and
    * is not the page itself.

    Duplicates are dropped; first-seen order is preserved.

    Args:
        url: Address of the page to fetch and scan.
        base_url: Prefix that a link must share to be returned.

    Returns:
        A list of absolute URLs, or an empty list if fetching or parsing
        fails (the error is logged, not raised).
    """
    # Imported locally so the function does not depend on the file-level
    # import line being extended.
    from urllib.parse import urldefrag, urlparse

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        base_host = urlparse(base_url).netloc
        page_url = urldefrag(url).url
        seen = set()
        valid_links = []
        for anchor in soup.find_all('a', href=True):
            # Resolve relative hrefs and drop "#fragment" so that anchors
            # pointing into the same document collapse to one URL.
            full_url = urldefrag(urljoin(url, anchor['href'])).url
            if full_url == page_url or full_url in seen:
                continue
            if not full_url.startswith(base_url):
                continue
            # Guard against look-alike hosts that merely share the prefix.
            if urlparse(full_url).netloc != base_host:
                continue
            seen.add(full_url)
            valid_links.append(full_url)
        return valid_links
    except Exception as e:
        # Best-effort crawl: one failing page must not abort the whole run.
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []
|
|
|
64 |
visited.add(current_url)
|
65 |
content = get_page_content(current_url)
|
66 |
all_pages.append((current_url, content))
|
67 |
+
logger.info(f"Processed page: {current_url} at depth {depth}")
|
68 |
|
69 |
if depth < max_depth:
|
70 |
+
links = get_links(current_url, base_url)
|
71 |
+
for link in links:
|
72 |
+
if link not in visited:
|
73 |
+
to_visit.append((link, depth + 1))
|
74 |
|
75 |
return all_pages
|
76 |
|