bluenevus committed on
Commit
81f7834
·
verified ·
1 Parent(s): 733d87e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -15
app.py CHANGED
@@ -57,21 +57,30 @@ def crawl_pages(base_url, max_depth):
57
  to_visit = [(base_url, 0)]
58
  all_pages = []
59
 
60
- while to_visit:
61
- current_url, depth = to_visit.pop(0)
62
- if current_url in visited or depth > max_depth:
63
- continue
64
-
65
- visited.add(current_url)
66
- content = get_page_content(current_url)
67
- all_pages.append((current_url, content))
68
- logger.info(f"Processed page: {current_url} at depth {depth}")
69
-
70
- if depth < max_depth:
71
- links = get_links(current_url, base_url)
72
- for link in links:
73
- if link not in visited:
74
- to_visit.append((link, depth + 1))
 
 
 
 
 
 
 
 
 
75
 
76
  return all_pages
77
 
 
57
  to_visit = [(base_url, 0)]
58
  all_pages = []
59
 
60
def process_page(url, depth):
    """Download one page and log progress.

    Returns a ``(url, content, depth)`` tuple so the caller can pair
    the fetched content back with its crawl position.
    """
    page_body = get_page_content(url)
    logger.info(f"Processed page: {url} at depth {depth}")
    return (url, page_body, depth)
64
+
65
+ with ThreadPoolExecutor(max_workers=10) as executor: # Adjust max_workers as needed
66
+ futures = []
67
+ while to_visit:
68
+ current_url, depth = to_visit.pop(0)
69
+ if current_url in visited or depth > max_depth:
70
+ continue
71
+
72
+ visited.add(current_url)
73
+ futures.append(executor.submit(process_page, current_url, depth))
74
+
75
+ if depth < max_depth:
76
+ links = get_links(current_url, base_url)
77
+ for link in links:
78
+ if link not in visited:
79
+ to_visit.append((link, depth + 1))
80
+
81
+ for future in as_completed(futures):
82
+ url, content, depth = future.result()
83
+ all_pages.append((url, content))
84
 
85
  return all_pages
86