Update app.py
app.py CHANGED
@@ -57,21 +57,30 @@ def crawl_pages(base_url, max_depth):
     to_visit = [(base_url, 0)]
     all_pages = []
 
-    [old lines 60-74: fifteen deleted lines whose content is blank in this capture]
+    def process_page(url, depth):
+        content = get_page_content(url)
+        logger.info(f"Processed page: {url} at depth {depth}")
+        return url, content, depth
+
+    with ThreadPoolExecutor(max_workers=10) as executor:  # Adjust max_workers as needed
+        futures = []
+        while to_visit:
+            current_url, depth = to_visit.pop(0)
+            if current_url in visited or depth > max_depth:
+                continue
+
+            visited.add(current_url)
+            futures.append(executor.submit(process_page, current_url, depth))
+
+            if depth < max_depth:
+                links = get_links(current_url, base_url)
+                for link in links:
+                    if link not in visited:
+                        to_visit.append((link, depth + 1))
+
+        for future in as_completed(futures):
+            url, content, depth = future.result()
+            all_pages.append((url, content))
 
     return all_pages
 
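For reference, here is a minimal, self-contained sketch of the pattern the new code follows: a breadth-first crawl in which page bodies are fetched concurrently through a ThreadPoolExecutor while the queue and link discovery stay in the main thread. get_page_content, get_links, and logger are assumptions standing in for the app's own helpers (none of them appear in the hunk), and visited = set() is initialized explicitly here because its setup presumably sits above the lines shown.

from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse
import logging
import re

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_page_content(url):
    # Stand-in for the app's fetch helper: return raw HTML, "" on failure.
    try:
        return requests.get(url, timeout=10).text
    except requests.RequestException:
        return ""


def get_links(url, base_url):
    # Stand-in for the app's link extractor: absolute same-site URLs only.
    html = get_page_content(url)
    links = set()
    for href in re.findall(r'href="([^"#]+)"', html):
        absolute = urljoin(url, href)
        if urlparse(absolute).netloc == urlparse(base_url).netloc:
            links.add(absolute)
    return links


def crawl_pages(base_url, max_depth):
    to_visit = [(base_url, 0)]
    all_pages = []
    visited = set()  # the diff uses this; its initialization must sit above the hunk

    def process_page(url, depth):
        content = get_page_content(url)
        logger.info(f"Processed page: {url} at depth {depth}")
        return url, content, depth

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        while to_visit:
            current_url, depth = to_visit.pop(0)
            if current_url in visited or depth > max_depth:
                continue

            visited.add(current_url)
            # Page bodies are fetched concurrently in worker threads...
            futures.append(executor.submit(process_page, current_url, depth))

            # ...while link discovery stays synchronous, so the BFS queue
            # is fully drained before the while loop exits.
            if depth < max_depth:
                for link in get_links(current_url, base_url):
                    if link not in visited:
                        to_visit.append((link, depth + 1))

        for future in as_completed(futures):
            url, content, depth = future.result()
            all_pages.append((url, content))

    return all_pages


if __name__ == "__main__":
    for url, _ in crawl_pages("https://example.com", max_depth=1):
        print(url)

One trade-off worth noting, assuming get_links fetches the page itself as in this sketch: every crawled page is requested twice, once by the worker for its content and once in the main thread for link extraction. Returning discovered links from process_page (or caching responses) would remove the duplicate fetch, at the cost of coordinating queue updates with future completion. Also, to_visit.pop(0) on a list is O(n); a collections.deque with popleft() is the more idiomatic queue.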