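# Assumed dependencies for this Space, inferred from the imports below
# (the bs4 module comes from the beautifulsoup4 package; FPDF from fpdf or fpdf2):
#   gradio, requests, beautifulsoup4, fpdf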
import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from fpdf import FPDF
import tempfile
import re
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def clean_text(text):
    """Drop non-printable and non-ASCII characters so FPDF's Latin-1 core fonts can render the text."""
    text = ''.join(char for char in text if char.isprintable())
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text

def get_page_content(url):
    """Fetch a page and return its visible text as a list of cleaned strings."""
    try:
        logger.info(f"Fetching content from: {url}")
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        content = []
        # Collect text from headings, paragraphs, and list items only.
        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
            for element in soup.find_all(tag):
                text = clean_text(element.get_text(strip=True))
                if text:
                    content.append(text)
        logger.info(f"Found {len(content)} content items for {url}")
        return content
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]

def get_subdirectory_pages(url, base_url, visited=None, max_pages=100):
    """Recursively collect (url, content) pairs for pages under base_url, up to max_pages."""
    # Avoid a mutable default argument so each top-level call starts with a fresh visited set.
    if visited is None:
        visited = set()
    if url in visited or len(visited) >= max_pages:
        return []
    visited.add(url)
    pages = [(url, get_page_content(url))]
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                full_url = urljoin(base_url, href)
                parsed_full_url = urlparse(full_url)
                parsed_base_url = urlparse(base_url)
                # Follow only links on the same host that stay under the base path
                # and go at most one directory level deeper.
                if (parsed_full_url.scheme == parsed_base_url.scheme and
                        parsed_full_url.netloc == parsed_base_url.netloc and
                        parsed_full_url.path.startswith(parsed_base_url.path) and
                        parsed_full_url.path.count('/') <= parsed_base_url.path.count('/') + 1):
                    if full_url not in visited:
                        pages.extend(get_subdirectory_pages(full_url, base_url, visited, max_pages))
                        if len(visited) >= max_pages:
                            break
    except Exception as e:
        logger.error(f"Error processing {url}: {e}")
    return pages

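# Illustration with a hypothetical base_url of "https://example.com/docs/": the path filter above
# keeps "https://example.com/docs/intro" and "https://example.com/docs/guide/page" (one level deep),
# but skips "https://example.com/blog/post" and anything nested more than one directory deeper.
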
def website_to_pdf(url):
    """Crawl the URL's section of the site and render the collected text into a PDF file."""
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
    logger.info(f"Starting to process: {base_url}")
    all_pages = get_subdirectory_pages(base_url, base_url)
    logger.info(f"Found {len(all_pages)} pages to process")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)
    for page_url, content in all_pages:
        # Each crawled page starts with its URL, followed by its extracted text.
        pdf.cell(0, 10, txt=page_url, ln=True)
        pdf.ln(5)
        for text in content:
            pdf.multi_cell(0, 10, txt=text)
        pdf.add_page()

    # Write the PDF to a temporary file and hand back its path for Gradio to serve.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        pdf_path = tmp.name
    pdf.output(pdf_path)
    logger.info(f"PDF saved to: {pdf_path}")
    return pdf_path

def process_url(url):
    try:
        return website_to_pdf(url)
    except Exception as e:
        logger.error(f"Error in process_url: {str(e)}")
        # A plain string is not a valid value for the gr.File output, so report the error via the UI.
        raise gr.Error(f"An error occurred: {str(e)}")

iface = gr.Interface(
    fn=process_url,
    inputs=gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs/gradio)"),
    outputs=gr.File(label="Download PDF"),
    title="Website Subdirectory to PDF Converter",
    description="Enter a website URL to convert its subdirectories into a PDF."
)

if __name__ == "__main__":
    iface.launch()
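# Sketch of programmatic use without the web UI (the URL is just the example from the textbox label):
#   pdf_path = website_to_pdf("https://www.gradio.app/docs/gradio")
#   print(f"PDF written to {pdf_path}")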