import gradio as gr
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from fpdf import FPDF
import tempfile
import re

def clean_text(text):
    # Remove any non-printable characters
    text = ''.join(char for char in text if char.isprintable())
    # Replace any remaining non-ASCII characters with spaces
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text
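# A worked example of clean_text (illustrative input, not from the app):
# clean_text("café ✓ menu") returns "caf   menu"; the accented character
# and the check mark fall outside the ASCII range and become spaces.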

def get_page_content(url):
    try:
        # A timeout keeps a slow or unresponsive server from hanging the crawl
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        content = []
        # Collect the visible text from headings, paragraphs, and list items
        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
            for element in soup.find_all(tag):
                text = clean_text(element.get_text(strip=True))
                if text:
                    content.append(text)
        return content
    except Exception as e:
        return [f"Error processing {url}: {str(e)}"]
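# Markup outside the tags listed above (scripts, styles, attributes) is
# ignored, so only readable body text reaches the PDF.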

def get_subdirectory_pages(url, base_url, visited=None, max_pages=100):
    # Create the visited set per top-level call; a mutable default
    # argument would persist across calls and skip previously seen URLs
    if visited is None:
        visited = set()
    if url in visited or len(visited) >= max_pages:
        return []
    visited.add(url)
    pages = [(url, get_page_content(url))]
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                full_url = urljoin(base_url, href)
                # Only follow links that stay under the base URL
                if full_url.startswith(base_url) and full_url not in visited:
                    pages.extend(get_subdirectory_pages(full_url, base_url, visited, max_pages))
                    if len(visited) >= max_pages:
                        break
    except Exception as e:
        print(f"Error processing {url}: {e}")
    return pages
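# The crawl is a depth-first traversal rooted at base_url: each href is
# resolved with urljoin, and only URLs that still start with base_url are
# followed, so the walk never leaves the entered site section. The shared
# visited set caps the total number of fetched pages at max_pages.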

def website_to_pdf(url):
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
    all_pages = get_subdirectory_pages(base_url, base_url)
    pdf = FPDF()
    pdf.set_font("Arial", size=12)
    # Start each crawled page on its own PDF page, headed by its URL
    # (adding the page inside the loop avoids a trailing blank page)
    for page_url, content in all_pages:
        pdf.add_page()
        pdf.cell(200, 10, txt=page_url, ln=True)
        for text in content:
            pdf.multi_cell(0, 10, txt=text)
    with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
        pdf.output(tmp.name)
        output_file = tmp.name
    return output_file
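# FPDF's built-in fonts ("Arial" is an alias for the core Helvetica font)
# only cover Latin-1 text, which is why clean_text strips non-ASCII
# characters before anything is written to the PDF.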

def process_url(url):
    try:
        pdf_file = website_to_pdf(url)
        return pdf_file
    except Exception as e:
        # gr.File expects a file path, so surface failures as a Gradio
        # error rather than returning the message string as if it were one
        raise gr.Error(f"An error occurred: {str(e)}")

iface = gr.Interface(
    fn=process_url,
    inputs=gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
    outputs=gr.File(label="Download PDF"),
    title="Website Subdirectory to PDF Converter",
    description="Enter a website URL to convert the pages under it into a single PDF."
)

if __name__ == "__main__":
    iface.launch()
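
# A minimal sketch of exercising the converter without the web UI
# (the URL is illustrative):
#
#   pdf_path = website_to_pdf("https://www.gradio.app/docs")
#   print(f"PDF written to {pdf_path}")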