Update app.py
app.py (CHANGED)
@@ -19,13 +19,15 @@ import time
 import os
 import ssl
 from io import BytesIO
+from concurrent.futures import ThreadPoolExecutor
+import math

 # Initialize Dash app
 app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
 server = app.server

 # Logging setup
-logging.basicConfig(level=logging.INFO)
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)

 # Thread-local storage for database connections
@@ -108,7 +110,7 @@ async def get_links(session, url, base_url):
         logger.error(f"Error getting links from {url}: {str(e)}")
         return []

-async def crawl_pages(base_url, max_depth):
+async def crawl_pages(base_url, max_depth, progress_callback):
     visited = set()
     to_visit = [(base_url, 0)]
     all_pages = []
@@ -140,6 +142,9 @@ async def crawl_pages(base_url, max_depth):
             all_pages.append((current_url, content))
             logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

+            progress = len(all_pages) / (max_depth * 10)  # Rough estimate
+            progress_callback(f"Crawling pages... {progress:.0%}")
+
             if depth < max_depth:
                 links = await get_links(session, current_url, base_url)
                 for link in links:
@@ -151,48 +156,75 @@ async def crawl_pages(base_url, max_depth):

     return all_pages

-def website_to_pdf(all_pages, progress_callback):
-    logger.info(f"Starting PDF generation for {len(all_pages)} pages")
-
+def create_pdf_chunk(chunk, start_index):
     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
     pdf.set_font("Arial", size=12)

-    ...
-    for ...
-    ...
+    for i, (page_url, content) in enumerate(chunk, start=start_index):
+        pdf.cell(0, 10, txt=f"Page {i+1}: {page_url}", ln=True)
+        pdf.ln(5)
+        for text in content:
+            try:
+                pdf.multi_cell(0, 10, txt=text[:200])  # Limit text length to avoid issues
+            except Exception as e:
+                logger.error(f"Error writing text to PDF: {str(e)}")
+                pdf.add_page()
+
+    return pdf.output(dest='S').encode('latin-1')
+
+async def website_to_pdf(all_pages, progress_callback):
+    logger.info(f"Starting PDF generation for {len(all_pages)} pages")
+
+    chunk_size = 100
+    num_chunks = math.ceil(len(all_pages) / chunk_size)
+    pdf_chunks = []
+
+    with ThreadPoolExecutor() as executor:
+        futures = []
+        for i in range(num_chunks):
+            start = i * chunk_size
+            end = min((i + 1) * chunk_size, len(all_pages))
+            chunk = all_pages[start:end]
+            future = executor.submit(create_pdf_chunk, chunk, start)
+            futures.append(future)
+
+        for i, future in enumerate(futures):
+            try:
+                pdf_chunk = await asyncio.wrap_future(future)
+                pdf_chunks.append(pdf_chunk)
+                progress = (i + 1) / num_chunks
+                progress_callback(f"Generating PDF... {progress:.0%}")
+            except Exception as e:
+                logger.error(f"Error generating PDF chunk {i}: {str(e)}")
+
+    # Combine PDF chunks
+    combined_pdf = FPDF()
+    for chunk in pdf_chunks:
+        combined_pdf.add_page()
+        combined_pdf.put_file(chunk)
+
+    return combined_pdf.output(dest='S').encode('latin-1')

 async def process_url(url, depth, progress_callback):
     try:
-        all_pages = await crawl_pages(url, depth)
+        all_pages = await asyncio.wait_for(crawl_pages(url, depth, progress_callback), timeout=3600)  # 1 hour timeout
         if not all_pages:
             return "No pages were successfully crawled. Please check the URL and try again."
-        pdf_content = website_to_pdf(all_pages, progress_callback)
-        return pdf_content
+        pdf_content = await asyncio.wait_for(website_to_pdf(all_pages, progress_callback), timeout=3600)  # 1 hour timeout for PDF generation
+        return pdf_content
+    except asyncio.TimeoutError:
+        logger.error("Process timed out after 1 hour")
+        return "The process timed out after 1 hour. Please try again with a smaller depth or a more specific URL."
     except Exception as e:
         logger.error(f"Error in process_url: {str(e)}")
         return f"An error occurred: {str(e)}"

-# App layout
 # App layout
 app.layout = dbc.Container([
-    dcc.Store(id='pdf-store'),
+    dcc.Store(id='pdf-store'),
+    dcc.Store(id='progress-store'),
     dbc.Navbar(
         dbc.Container([
             html.A(
@@ -218,6 +250,7 @@ app.layout = dbc.Container([
                 dbc.Input(id="url-input", type="text", placeholder="Enter website URL (e.g., https://www.gradio.app/docs)", className="mb-3"),
                 dcc.Slider(id="depth-slider", min=1, max=10, step=1, value=3, marks={i: str(i) for i in range(1, 11)}, className="mb-3"),
                 dbc.Button("Convert to PDF", id="submit-button", color="primary", className="mb-3 w-100"),
+                dbc.Progress(id="progress-bar", animated=True, striped=True, className="mb-3"),
                 dbc.Spinner(html.Div(id="output-area"), color="primary", type="grow"),
                 dcc.Interval(id="progress-interval", interval=1000, n_intervals=0, disabled=True),
             ]),
@@ -228,13 +261,16 @@ app.layout = dbc.Container([
 @app.callback(
     Output("output-area", "children"),
     Output("progress-interval", "disabled"),
+    Output("progress-bar", "value"),
+    Output("progress-bar", "label"),
     Input("submit-button", "n_clicks"),
     Input("progress-interval", "n_intervals"),
     State("url-input", "value"),
     State("depth-slider", "value"),
+    State("progress-store", "data"),
     prevent_initial_call=True
 )
-def update_output(n_clicks, n_intervals, url, depth):
+def update_output(n_clicks, n_intervals, url, depth, progress):
     ctx = dash.callback_context
     if not ctx.triggered:
         raise PreventUpdate
@@ -243,17 +279,19 @@ def update_output(n_clicks, n_intervals, url, depth):

     if triggered_id == "submit-button":
         if not url:
-            return "Please enter a valid URL.", True
+            return "Please enter a valid URL.", True, 0, ""

-        return ...
+        return "Processing... Please wait.", False, 0, "0%"

     elif triggered_id == "progress-interval":
         store = dash.callback_context.inputs.get('pdf-store', None)
-        if store is None:
-            ...
+        if store is None:
+            if progress:
+                return "Processing... Please wait.", False, int(progress.split('%')[0]), progress
+            return "Processing... Please wait.", False, 0, "0%"

         if isinstance(store, str) and store.startswith("Error"):
-            return store, True
+            return store, True, 100, "100%"

         try:
             encoded = base64.b64encode(store).decode()
@@ -264,15 +302,16 @@ def update_output(n_clicks, n_intervals, url, depth):
                     href=f"data:application/pdf;base64,{encoded}",
                     download="website_content.pdf"
                 )
-            ]), True
+            ]), True, 100, "100%"
         except Exception as e:
             logger.error(f"Error creating download link: {str(e)}")
-            return f"An error occurred while creating the download link: {str(e)}", True
+            return f"An error occurred while creating the download link: {str(e)}", True, 100, "100%"

     raise PreventUpdate

 @app.callback(
     Output('pdf-store', 'data'),
+    Output('progress-store', 'data'),
     Input('submit-button', 'n_clicks'),
     State('url-input', 'value'),
     State('depth-slider', 'value'),
@@ -280,9 +319,9 @@ def update_output(n_clicks, n_intervals, url, depth):
 )
 def generate_pdf(n_clicks, url, depth):
     if not url:
-        return "Please enter a valid URL."
+        return "Please enter a valid URL.", "0%"

-    progress_store = {'progress': 0}
+    progress_store = {'progress': "0%"}

     def progress_callback(message):
         progress_store['progress'] = message
@@ -290,9 +329,9 @@ def generate_pdf(n_clicks, url, depth):
     pdf_content = asyncio.run(process_url(url, depth, progress_callback))

     if isinstance(pdf_content, str):
-        return pdf_content  # This is an error message
+        return pdf_content, "100%"  # This is an error message

-    return pdf_content
+    return pdf_content, "100%"

 if __name__ == '__main__':
     print("Starting the Dash application...")
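Note on the chunk-combining step: as far as I know, FPDF (pyfpdf/fpdf2) does not expose a `put_file` method for embedding an already-rendered PDF byte string, so the `combined_pdf` loop in the new `website_to_pdf` may fail at runtime. Below is a minimal sketch of an alternative, not part of this commit, assuming the per-chunk byte strings returned by `create_pdf_chunk` and the `pypdf` package as an extra dependency; the helper name `combine_pdf_chunks` is hypothetical.

# Hypothetical helper (not in the commit): merge per-chunk PDF byte strings
# with pypdf instead of FPDF.put_file. Assumes `pip install pypdf`.
from io import BytesIO

from pypdf import PdfReader, PdfWriter

def combine_pdf_chunks(pdf_chunks):
    """Merge a list of PDF byte strings into a single PDF byte string."""
    writer = PdfWriter()
    for chunk in pdf_chunks:
        # append() copies every page of the chunk into the writer
        writer.append(PdfReader(BytesIO(chunk)))
    buffer = BytesIO()
    writer.write(buffer)
    return buffer.getvalue()

With a helper along these lines, `website_to_pdf` could simply return `combine_pdf_chunks(pdf_chunks)` instead of assembling a second `FPDF()` object.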