Update app.py
app.py CHANGED
@@ -19,17 +19,16 @@ import time
 import os
 import ssl
 from io import BytesIO
+import tempfile
+import uuid
 from concurrent.futures import ThreadPoolExecutor
-import math
-from PyPDF2 import PdfMerger
-

 # Initialize Dash app
 app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
 server = app.server

 # Logging setup
-logging.basicConfig(level=logging.INFO…
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 # Thread-local storage for database connections
@@ -43,6 +42,9 @@ ssl_context = ssl.create_default_context()
 ssl_context.check_hostname = False
 ssl_context.verify_mode = ssl.CERT_NONE

+# ThreadPoolExecutor for background tasks
+executor = ThreadPoolExecutor(max_workers=4)
+
 @contextmanager
 def get_db_connection():
     if not hasattr(thread_local, "connection"):
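The executor created in this hunk is the same one the later hunks hand work to, both through executor.submit(background_task, ...) and through loop.run_in_executor(executor, ...). A minimal, self-contained sketch of the submit pattern; the job function and task id below are placeholders, not part of the commit:

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=4)

def long_running_job(task_id):
    # Placeholder for blocking work such as PDF generation.
    return f"done: {task_id}"

# submit() returns a Future immediately; the job runs on a worker thread.
future = executor.submit(long_running_job, "task-123")

# result() blocks until the job finishes and re-raises any exception it raised.
print(future.result())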
@@ -112,7 +114,7 @@ async def get_links(session, url, base_url):
         logger.error(f"Error getting links from {url}: {str(e)}")
         return []

-async def crawl_pages(base_url, max_depth, progress_callback):
+async def crawl_pages(base_url, max_depth):
     visited = set()
     to_visit = [(base_url, 0)]
     all_pages = []
@@ -144,9 +146,6 @@ async def crawl_pages(base_url, max_depth, progress_callback):
                 all_pages.append((current_url, content))
                 logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

-                progress = len(all_pages) / (max_depth * 10) # Rough estimate
-                progress_callback(f"Crawling pages... {progress:.0%}")
-
             if depth < max_depth:
                 links = await get_links(session, current_url, base_url)
                 for link in links:
@@ -158,70 +157,63 @@ async def crawl_pages(base_url, max_depth, progress_callback):

     return all_pages

-def …
+def generate_pdf_chunk(chunk, output_file):
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
     pdf.set_font("Arial", size=12)

-    for …
-        pdf.cell(0, 10, txt=…
+    for page_url, content in chunk:
+        pdf.cell(0, 10, txt=page_url, ln=True)
         pdf.ln(5)
         for text in content:
             try:
                 pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
             except Exception as e:
                 logger.error(f"Error writing text to PDF: {str(e)}")
-    pdf.…
+        if pdf.get_y() > 250: # Add a new page if the current page is almost full
+            pdf.add_page()

-…
+    pdf.output(output_file)

-…
+def website_to_pdf(all_pages, progress_callback):
     logger.info(f"Starting PDF generation for {len(all_pages)} pages")

     chunk_size = 100
-…
-    with …
-…
-    for chunk in pdf_chunks:
-        merger.append(BytesIO(chunk))
-
-    output = BytesIO()
-    merger.write(output)
-    merger.close()
-
-    return output.getvalue()
+    total_chunks = (len(all_pages) + chunk_size - 1) // chunk_size
+    temp_files = []
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        for i in range(0, len(all_pages), chunk_size):
+            chunk = all_pages[i:i+chunk_size]
+            temp_file = os.path.join(temp_dir, f"chunk_{i}.pdf")
+            generate_pdf_chunk(chunk, temp_file)
+            temp_files.append(temp_file)
+
+            progress = min((i + chunk_size) / len(all_pages), 1.0)
+            progress_callback(f"Processing pages... {progress:.0%}")
+
+        # Merge PDF chunks
+        output_pdf = os.path.join(temp_dir, "final.pdf")
+        merger = PdfMerger()
+        for temp_file in temp_files:
+            merger.append(temp_file)
+        merger.write(output_pdf)
+        merger.close()
+
+        with open(output_pdf, 'rb') as f:
+            return f.read()

 async def process_url(url, depth, progress_callback):
     try:
-        all_pages = await …
+        all_pages = await crawl_pages(url, depth)
         if not all_pages:
             return "No pages were successfully crawled. Please check the URL and try again."
-…
+
+        # Use ThreadPoolExecutor to run PDF generation in a separate thread
+        loop = asyncio.get_event_loop()
+        pdf_content = await loop.run_in_executor(executor, website_to_pdf, all_pages, progress_callback)
         return pdf_content
-    except asyncio.TimeoutError:
-        logger.error("Process timed out after 1 hour")
-        return "The process timed out after 1 hour. Please try again with a smaller depth or a more specific URL."
     except Exception as e:
         logger.error(f"Error in process_url: {str(e)}")
         return f"An error occurred: {str(e)}"
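process_url now awaits the blocking PDF build through loop.run_in_executor, so the asyncio loop that drives the crawler is not stalled by the FPDF/PyPDF2 work. A stripped-down sketch of that offloading pattern, with a stand-in blocking function in place of website_to_pdf (the names blocking_work and main are illustrative only):

import asyncio
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=4)

def blocking_work(n):
    # Stand-in for a CPU- or IO-heavy call such as website_to_pdf(all_pages, cb).
    return sum(range(n))

async def main():
    loop = asyncio.get_running_loop()
    # The await yields control to the event loop while a worker thread runs the job.
    result = await loop.run_in_executor(executor, blocking_work, 1_000_000)
    print(result)

asyncio.run(main())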
@@ -255,7 +247,7 @@ app.layout = dbc.Container([
         dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
         dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
         dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
-        dbc.Progress(id="progress-bar", …
+        dbc.Progress(id="progress-bar", style={"visibility": "hidden"}),
         dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
         dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
     ]),
@@ -266,16 +258,14 @@ app.layout = dbc.Container([
 @app.callback(
     Output("output-area", "children"),
     Output("progress-interval", "disabled"),
-    Output("progress-bar", "…
-    Output("progress-bar", "label"),
+    Output("progress-bar", "style"),
     Input("submit-button", "n_clicks"),
     Input("progress-interval", "n_intervals"),
     State("url-input", "value"),
     State("depth-slider", "value"),
-    State("progress-store", "data"),
     prevent_initial_call=True
 )
-def update_output(n_clicks, n_intervals, url, depth, progress):
+def update_output(n_clicks, n_intervals, url, depth):
     ctx = dash.callback_context
     if not ctx.triggered:
         raise PreventUpdate
@@ -284,22 +274,29 @@ def update_output(n_clicks, n_intervals, url, depth, progress):

     if triggered_id == "submit-button":
         if not url:
-            return "Please enter a valid URL.", True, …
+            return "Please enter a valid URL.", True, {"visibility": "hidden"}

-…
+        # Start the background task
+        task_id = str(uuid.uuid4())
+        executor.submit(background_task, url, depth, task_id)
+
+        return "Processing... Please wait.", False, {"visibility": "visible"}

     elif triggered_id == "progress-interval":
-…
+        # Check progress
+        progress = dash.callback_context.inputs['progress-store.data']
+        if progress is None:
+            return "Processing... Please wait.", False, {"visibility": "visible"}
+
+        if isinstance(progress, str) and progress.startswith("Error"):
+            return progress, True, {"visibility": "hidden"}
+
+        if isinstance(progress, str) and progress.startswith("Processing"):
+            return progress, False, {"visibility": "visible"}
+
+        # PDF generation complete
         try:
-            encoded = base64.b64encode(…
+            encoded = base64.b64encode(progress).decode()
             return html.Div([
                 html.H4("PDF Generated Successfully"),
                 html.A(
@@ -307,36 +304,38 @@ def update_output(n_clicks, n_intervals, url, depth, progress):
                     href=f"data:application/pdf;base64,{encoded}",
                     download="website_content.pdf"
                 )
-            ]), True, …
+            ]), True, {"visibility": "hidden"}
         except Exception as e:
             logger.error(f"Error creating download link: {str(e)}")
-            return f"An error occurred while creating the download link: {str(e)}", True, …
+            return f"An error occurred while creating the download link: {str(e)}", True, {"visibility": "hidden"}

     raise PreventUpdate

 @app.callback(
-    Output('pdf-store', 'data'),
     Output('progress-store', 'data'),
-    Input('…
-    State('url-input', 'value'),
-    State('depth-slider', 'value'),
+    Input('progress-interval', 'n_intervals'),
     prevent_initial_call=True
 )
-def …
-…
+def update_progress(n):
+    # This function will be called every second to update the progress
+    # You can implement a mechanism to check the actual progress of the PDF generation
+    # For now, we'll just return a placeholder message
+    return "Processing... Please wait."

+def background_task(url, depth, task_id):
     def progress_callback(message):
-…
-        pdf_content = asyncio.run(process_url(url, depth, progress_callback))
+        # Update progress in the database or a shared data structure
+        pass

-…
+    try:
+        pdf_content = asyncio.run(process_url(url, depth, progress_callback))
+        # Store the result in a database or shared data structure
+        # For simplicity, we'll use the progress-store, but in a real application,
+        # you should use a more robust solution for storing large data
+        app.layout.children[1].data = pdf_content
+    except Exception as e:
+        logger.error(f"Error in background task: {str(e)}")
+        app.layout.children[1].data = f"Error: {str(e)}"

 if __name__ == '__main__':
     print("Starting the Dash application...")
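background_task leaves progress_callback as a pass and stores the finished PDF by assigning to app.layout.children[1].data; the inline comments themselves note that a real deployment should use a more robust shared store. One possible in-process shape for such a store, purely illustrative (TASKS, report_progress, store_result and read_task are not part of the commit):

import threading

# Hypothetical registry shared between the worker thread and the Dash callbacks:
# task_id -> {"progress": str, "result": bytes}
TASKS = {}
TASKS_LOCK = threading.Lock()

def report_progress(task_id, message):
    # Called from the worker thread, e.g. inside progress_callback.
    with TASKS_LOCK:
        TASKS.setdefault(task_id, {})["progress"] = message

def store_result(task_id, pdf_bytes):
    # Called once the PDF bytes are ready.
    with TASKS_LOCK:
        TASKS.setdefault(task_id, {})["result"] = pdf_bytes

def read_task(task_id):
    # Called from the progress-interval callback to decide what to render.
    with TASKS_LOCK:
        return dict(TASKS.get(task_id, {}))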