# Gradio app: crawl a documentation site breadth-first and convert the crawled
# pages into a single downloadable PDF.

import logging
import re
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def clean_text(text):
    """Strip non-printable and non-ASCII characters so FPDF's core Arial font can render the text."""
    text = ''.join(char for char in text if char.isprintable())
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text


def get_page_content(url):
    """Fetch a page and return its readable text (headings, paragraphs, list items)."""
    try:
        logger.info(f"Fetching content from: {url}")
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        content = []
        # Prefer the main article container; fall back to the whole document.
        main_content = soup.find('article') or soup.find('main') or soup
        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
            for element in main_content.find_all(tag):
                text = clean_text(element.get_text(strip=True))
                if text:
                    content.append(text)

        logger.info(f"Found {len(content)} content items for {url}")
        return content
    except Exception as e:
        logger.error(f"Error processing {url}: {e}")
        return [f"Error processing {url}: {e}"]


def get_links(url, base_url):
    """Return links on the page that stay within base_url (fragments stripped)."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        valid_links = []
        for link in soup.find_all('a', href=True):
            full_url = urljoin(url, link['href']).split('#')[0]  # drop #anchors to avoid duplicate pages
            if full_url.startswith(base_url) and full_url != url:
                valid_links.append(full_url)
        return valid_links
    except Exception as e:
        logger.error(f"Error getting links from {url}: {e}")
        return []


def crawl_pages(base_url, max_depth):
    """Breadth-first crawl from base_url, fetching page content in a thread pool."""
    visited = set()
    to_visit = [(base_url, 0)]
    all_pages = []

    def process_page(url, depth):
        content = get_page_content(url)
        logger.info(f"Processed page: {url} at depth {depth}")
        return url, content, depth

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        # Link discovery runs in this thread; content extraction runs in the pool.
        while to_visit:
            current_url, depth = to_visit.pop(0)
            if current_url in visited or depth > max_depth:
                continue
            visited.add(current_url)
            futures.append(executor.submit(process_page, current_url, depth))
            if depth < max_depth:
                for link in get_links(current_url, base_url):
                    if link not in visited:
                        to_visit.append((link, depth + 1))

        for future in as_completed(futures):
            url, content, _depth = future.result()
            all_pages.append((url, content))

    return all_pages


def website_to_pdf(url, max_depth):
    """Crawl the site and write each crawled page onto its own page of a PDF."""
    logger.info(f"Starting to process: {url} with max depth: {max_depth}")
    all_pages = crawl_pages(url, max_depth)
    logger.info(f"Found {len(all_pages)} pages to process")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for i, (page_url, content) in enumerate(all_pages):
        if i > 0:
            pdf.add_page()  # each crawled page starts on a fresh PDF page (no trailing blank page)
        pdf.cell(0, 10, txt=page_url, ln=True)
        pdf.ln(5)
        for text in content:
            try:
                pdf.multi_cell(0, 10, txt=text[:200])  # limit text length to avoid layout issues
            except Exception as e:
                logger.error(f"Error writing text to PDF: {e}")

    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        pdf_path = tmp.name
    pdf.output(pdf_path)
    logger.info(f"PDF saved to: {pdf_path}")
    return pdf_path


def process_url(url, depth):
    try:
        return website_to_pdf(url, depth)
    except Exception as e:
        logger.error(f"Error in process_url: {e}")
        # Surface the failure in the UI instead of returning a non-file string to gr.File.
        raise gr.Error(f"An error occurred: {e}")


def threaded_process_url(url, depth):
    # Run the crawl in a worker thread and block until the PDF path is ready.
    with ThreadPoolExecutor() as executor:
        future = executor.submit(process_url, url, depth)
        return future.result()


iface = gr.Interface(
    fn=threaded_process_url,
    inputs=[
        gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
        gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Crawl Depth"),
    ],
    outputs=gr.File(label="Download PDF"),
    title="Website to PDF Converter",
    description="Enter a docs URL and a crawl depth to convert the documentation pages into a PDF. Only crawl sites you have permission to archive.",
)

if __name__ == "__main__":
    iface.launch()