Update app.py
app.py CHANGED
@@ -57,21 +57,30 @@ def crawl_pages(base_url, max_depth):
     to_visit = [(base_url, 0)]
     all_pages = []
 
-    [old lines 60-74: fifteen deleted lines whose content is blank in this capture]
+    def process_page(url, depth):
+        content = get_page_content(url)
+        logger.info(f"Processed page: {url} at depth {depth}")
+        return url, content, depth
+
+    with ThreadPoolExecutor(max_workers=10) as executor:  # Adjust max_workers as needed
+        futures = []
+        while to_visit:
+            current_url, depth = to_visit.pop(0)
+            if current_url in visited or depth > max_depth:
+                continue
+
+            visited.add(current_url)
+            futures.append(executor.submit(process_page, current_url, depth))
+
+            if depth < max_depth:
+                links = get_links(current_url, base_url)
+                for link in links:
+                    if link not in visited:
+                        to_visit.append((link, depth + 1))
+
+        for future in as_completed(futures):
+            url, content, depth = future.result()
+            all_pages.append((url, content))
 
     return all_pages
 
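For reference, here is a minimal, self-contained sketch of the pattern the new code follows: a breadth-first crawl in which page bodies are fetched concurrently through a ThreadPoolExecutor while the queue and link discovery stay in the main thread. get_page_content, get_links, and logger are assumptions standing in for the app's own helpers (none of them appear in the hunk), and visited = set() is initialized explicitly here because its setup presumably sits above the lines shown.

from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin, urlparse
import logging
import re

import requests

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_page_content(url):
    # Stand-in for the app's fetch helper: return raw HTML, "" on failure.
    try:
        return requests.get(url, timeout=10).text
    except requests.RequestException:
        return ""


def get_links(url, base_url):
    # Stand-in for the app's link extractor: absolute same-site URLs only.
    html = get_page_content(url)
    links = set()
    for href in re.findall(r'href="([^"#]+)"', html):
        absolute = urljoin(url, href)
        if urlparse(absolute).netloc == urlparse(base_url).netloc:
            links.add(absolute)
    return links


def crawl_pages(base_url, max_depth):
    to_visit = [(base_url, 0)]
    all_pages = []
    visited = set()  # the diff uses this; its initialization must sit above the hunk

    def process_page(url, depth):
        content = get_page_content(url)
        logger.info(f"Processed page: {url} at depth {depth}")
        return url, content, depth

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        while to_visit:
            current_url, depth = to_visit.pop(0)
            if current_url in visited or depth > max_depth:
                continue

            visited.add(current_url)
            # Page bodies are fetched concurrently in worker threads...
            futures.append(executor.submit(process_page, current_url, depth))

            # ...while link discovery stays synchronous, so the BFS queue
            # is fully drained before the while loop exits.
            if depth < max_depth:
                for link in get_links(current_url, base_url):
                    if link not in visited:
                        to_visit.append((link, depth + 1))

        for future in as_completed(futures):
            url, content, depth = future.result()
            all_pages.append((url, content))

    return all_pages


if __name__ == "__main__":
    for url, _ in crawl_pages("https://example.com", max_depth=1):
        print(url)

One trade-off worth noting, assuming get_links fetches the page itself as in this sketch: every crawled page is requested twice, once by the worker for its content and once in the main thread for link extraction. Returning discovered links from process_page (or caching responses) would remove the duplicate fetch, at the cost of coordinating queue updates with future completion. Also, to_visit.pop(0) on a list is O(n); a collections.deque with popleft() is the more idiomatic queue.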