Update app.py
app.py CHANGED
@@ -8,7 +8,6 @@ import re
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
@@ -35,7 +34,7 @@ def get_page_content(url):
         return content
     except Exception as e:
         logger.error(f"Error processing {url}: {str(e)}")
-        return [
+        return []  # Return an empty list instead of error message
 
 def get_links(url, base_url):
     try:
@@ -63,7 +62,7 @@ def crawl_pages(base_url, max_depth):
             logger.info(f"Processed page: {url} at depth {depth}")
             return url, content, depth
 
-    with ThreadPoolExecutor(max_workers=10) as executor:
+    with ThreadPoolExecutor(max_workers=10) as executor:
         futures = []
         while to_visit:
             current_url, depth = to_visit.pop(0)
@@ -81,7 +80,8 @@ def crawl_pages(base_url, max_depth):
 
         for future in as_completed(futures):
             url, content, depth = future.result()
-            all_pages.append((url, content))
+            if content:  # Only add pages with content
+                all_pages.append((url, content))
 
     return all_pages
 
@@ -118,16 +118,18 @@ def process_url(url, depth):
         return pdf_file
     except Exception as e:
         logger.error(f"Error in process_url: {str(e)}")
-        return
+        return None  # Return None instead of error message
 
-# Add this new function
 def threaded_process_url(url, depth):
     with ThreadPoolExecutor() as executor:
         future = executor.submit(process_url, url, depth)
-        return future.result()
+        result = future.result()
+        if result is None:
+            return gr.update(value=None, visible=False)
+        return gr.update(value=result, visible=True)
 
 iface = gr.Interface(
-    fn=threaded_process_url,
+    fn=threaded_process_url,
     inputs=[
         gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
         gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Crawl Depth")
@@ -138,4 +140,4 @@ iface = gr.Interface(
 )
 
 if __name__ == "__main__":
-    iface.launch(
+    iface.launch()
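The crawl change above follows a common fan-out/fan-in pattern: submit each discovered URL to a ThreadPoolExecutor, harvest results with as_completed, and keep only pages that actually returned content. A minimal self-contained sketch of that pattern, with fetch_page as a hypothetical stand-in for the app's get_page_content:

from concurrent.futures import ThreadPoolExecutor, as_completed
import urllib.request

def fetch_page(url, depth):
    # Hypothetical stand-in for the app's get_page_content():
    # returns (url, text, depth), with empty text on failure.
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:
            return url, resp.read().decode("utf-8", errors="replace"), depth
    except Exception:
        return url, "", depth

def crawl(start_urls, max_workers=10):
    all_pages = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(fetch_page, u, 0) for u in start_urls]
        for future in as_completed(futures):
            url, content, depth = future.result()
            if content:  # only keep pages with content, as the commit does
                all_pages.append((url, content))
    return all_pages

Collecting with as_completed rather than iterating the futures list in order lets fast pages land first; the if-content guard is what replaces the old unconditional append.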
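The threaded_process_url change also switches the handler's return value to gr.update(...), which lets a Gradio handler change a component's properties (here value and visible) instead of returning a bare value, so the output component can be hidden on failure. A hedged sketch of that wiring, assuming the interface's output is a gr.File component (the output definition sits outside the hunks shown) and using make_pdf as a hypothetical stand-in for the app's process_url:

import gradio as gr

def make_pdf(url, depth):
    # Hypothetical stand-in for the app's process_url(); returns a
    # file path on success or None on failure.
    return "/tmp/site.pdf" if url else None

def handler(url, depth):
    result = make_pdf(url, depth)
    if result is None:
        # Hide the output file component rather than showing an error string.
        return gr.update(value=None, visible=False)
    return gr.update(value=result, visible=True)

iface = gr.Interface(
    fn=handler,
    inputs=[
        gr.Textbox(label="Enter website URL"),
        gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Crawl Depth"),
    ],
    outputs=gr.File(label="Generated PDF"),  # assumed output component
)

if __name__ == "__main__":
    iface.launch()

Note that threaded_process_url submits to an executor and immediately blocks on future.result(), so it is still synchronous from Gradio's point of view; the executor mainly isolates the crawl in a worker thread.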