Update app.py
app.py CHANGED
@@ -2,11 +2,16 @@ import gradio as gr
 import requests
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
-from
-from reportlab.lib.styles import getSampleStyleSheet
-from reportlab.platypus import SimpleDocTemplate, Paragraph
-from io import BytesIO
+from fpdf import FPDF
 import tempfile
+import re
+
+def clean_text(text):
+    # Remove any non-printable characters
+    text = ''.join(char for char in text if char.isprintable())
+    # Replace any remaining problematic characters
+    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
+    return text
 
 def get_page_content(url):
     try:
@@ -15,15 +20,15 @@ def get_page_content(url):
         content = []
         for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
             for element in soup.find_all(tag):
-                text = element.get_text(strip=True)
+                text = clean_text(element.get_text(strip=True))
                 if text:
                     content.append(text)
         return content
     except Exception as e:
         return [f"Error processing {url}: {str(e)}"]
 
-def get_subdirectory_pages(url, base_url, visited=set()):
-    if url in visited:
+def get_subdirectory_pages(url, base_url, visited=set(), max_pages=100):
+    if url in visited or len(visited) >= max_pages:
         return []
 
     visited.add(url)
@@ -38,7 +43,9 @@ def get_subdirectory_pages(url, base_url, visited=set()):
             if href:
                 full_url = urljoin(base_url, href)
                 if full_url.startswith(base_url) and full_url not in visited:
-                    pages.extend(get_subdirectory_pages(full_url, base_url, visited))
+                    pages.extend(get_subdirectory_pages(full_url, base_url, visited, max_pages))
+                    if len(visited) >= max_pages:
+                        break
     except Exception as e:
         print(f"Error processing {url}: {e}")
 
@@ -49,21 +56,18 @@ def website_to_pdf(url):
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
     all_pages = get_subdirectory_pages(base_url, base_url)
 
-
-
-
-    story = []
+    pdf = FPDF()
+    pdf.add_page()
+    pdf.set_font("Arial", size=12)
 
     for page_url, content in all_pages:
-
+        pdf.cell(200, 10, txt=page_url, ln=True)
         for text in content:
-
-
+            pdf.multi_cell(0, 10, txt=text)
+        pdf.add_page()
 
-    doc.build(story)
-
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
-
+        pdf.output(tmp.name)
         output_file = tmp.name
 
     return output_file
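
The hunk context lines show that these functions sit inside a Gradio app (`import gradio as gr` heads the first hunk), but the interface wiring itself is outside the changed region. As orientation only, a minimal sketch of how the updated `website_to_pdf` could be exposed; the import path, component labels, title, and launch call are assumptions, not part of this commit.

import gradio as gr

from app import website_to_pdf  # import path assumed; this Space's entry point is app.py

# Hypothetical wiring: website_to_pdf returns the path of a temporary .pdf file,
# which gr.File can serve back to the browser as a download.
demo = gr.Interface(
    fn=website_to_pdf,
    inputs=gr.Textbox(label="Website URL"),   # assumed label
    outputs=gr.File(label="Generated PDF"),   # assumed label
    title="Website to PDF",                   # assumed title
)

if __name__ == "__main__":
    demo.launch()

Because website_to_pdf writes to a NamedTemporaryFile with delete=False, the returned path stays valid after the function returns, which is what a file-output component needs in order to serve the download.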