Update app.py
app.py CHANGED
@@ -7,87 +7,111 @@ import tempfile
import re
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
+import asyncio
+import aiohttp
+from aiolimiter import AsyncLimiter
+import sqlite3

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

+# Initialize SQLite database
+conn = sqlite3.connect('crawl_cache.db')
+c = conn.cursor()
+c.execute('''CREATE TABLE IF NOT EXISTS pages
+             (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
+conn.commit()
+
+# Rate limiter: 10 requests per second
+rate_limiter = AsyncLimiter(10, 1)
+
def clean_text(text):
    text = ''.join(char for char in text if char.isprintable())
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text

-def get_page_content(url):
+async def get_page_content(session, url):
    try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        async with rate_limiter:
+            async with session.get(url, timeout=30) as response:
+                if response.status == 200:
+                    text = await response.text()
+                    soup = BeautifulSoup(text, 'html.parser')
+                    content = []
+                    main_content = soup.find('article') or soup.find('main') or soup
+                    if main_content:
+                        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+                            for element in main_content.find_all(tag):
+                                text = clean_text(element.get_text(strip=True))
+                                if text:
+                                    content.append(text)
+                    logger.info(f"Found {len(content)} content items for {url}")
+                    return content
+                else:
+                    logger.error(f"Error fetching {url}: HTTP {response.status}")
+                    return [f"Error fetching {url}: HTTP {response.status}"]
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]

-def get_links(url, base_url):
+async def get_links(session, url, base_url):
    try:
-
-
-
-
-
-
-
-
-
-
+        async with rate_limiter:
+            async with session.get(url, timeout=30) as response:
+                if response.status == 200:
+                    text = await response.text()
+                    soup = BeautifulSoup(text, 'html.parser')
+                    links = soup.find_all('a', href=True)
+                    valid_links = []
+                    for link in links:
+                        full_url = urljoin(url, link['href'])
+                        if full_url.startswith(base_url) and full_url != url:
+                            valid_links.append(full_url)
+                    return valid_links
+                else:
+                    logger.error(f"Error fetching links from {url}: HTTP {response.status}")
+                    return []
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []

-def crawl_pages(base_url, max_depth):
+async def crawl_pages(base_url, max_depth):
    visited = set()
    to_visit = [(base_url, 0)]
    all_pages = []

-
-        content = get_page_content(url)
-        logger.info(f"Processed page: {url} at depth {depth}")
-        return url, content, depth
-
-    with ThreadPoolExecutor(max_workers=10) as executor:
-        futures = []
+    async with aiohttp.ClientSession() as session:
        while to_visit:
            current_url, depth = to_visit.pop(0)
            if current_url in visited or depth > max_depth:
                continue

            visited.add(current_url)
-
+
+            # Check if page is already in the database
+            c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+            result = c.fetchone()
+            if result:
+                content = eval(result[0]) # Convert string back to list
+            else:
+                content = await get_page_content(session, current_url)
+                # Store in database
+                c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                conn.commit()
+
+            all_pages.append((current_url, content))
+            logger.info(f"Processed page: {current_url} at depth {depth}")

            if depth < max_depth:
-                links = get_links(current_url, base_url)
+                links = await get_links(session, current_url, base_url)
                for link in links:
                    if link not in visited:
                        to_visit.append((link, depth + 1))

-        for future in as_completed(futures):
-            url, content, depth = future.result()
-            all_pages.append((url, content))
-
    return all_pages

-def website_to_pdf(
-    logger.info(f"Starting
-    all_pages = crawl_pages(url, max_depth)
-    logger.info(f"Found {len(all_pages)} pages to process")
+def website_to_pdf(all_pages):
+    logger.info(f"Starting PDF generation for {len(all_pages)} pages")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
@@ -111,24 +135,23 @@ def website_to_pdf(url, max_depth):

    return pdf_path

-def process_url(url, depth):
+async def process_url(url, depth):
    try:
-
+        all_pages = await crawl_pages(url, depth)
+        pdf_file = website_to_pdf(all_pages)
        return pdf_file
    except Exception as e:
        logger.error(f"Error in process_url: {str(e)}")
        return f"An error occurred: {str(e)}"

-def
-
-        future = executor.submit(process_url, url, depth)
-        return future.result()
+def run_async(url, depth):
+    return asyncio.run(process_url(url, depth))

iface = gr.Interface(
-    fn=
+    fn=run_async,
    inputs=[
        gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
-        gr.Slider(minimum=1, maximum=
+        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Crawl Depth")
    ],
    outputs=gr.File(label="Download PDF"),
    title="Website to PDF Converter",
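
Note on the rate limiter: both fetch helpers above enter the shared AsyncLimiter before each session.get(), so at most 10 requests start per second across all in-flight tasks. A minimal sketch of that pattern in isolation; fetch() below is a hypothetical stand-in for the real aiohttp call:

import asyncio
from aiolimiter import AsyncLimiter

limiter = AsyncLimiter(10, 1)  # 10 permits per 1-second window

async def fetch(i):
    async with limiter:  # waits here once the current window is exhausted
        await asyncio.sleep(0)  # stand-in for an HTTP request
        return i

async def main():
    results = await asyncio.gather(*(fetch(i) for i in range(25)))
    print(f"{len(results)} tasks completed under the rate limit")

asyncio.run(main())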
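
Note on content extraction: get_page_content() prefers an article or main element, falls back to the whole document, and then collects the text of paragraph, heading, and list-item tags. A compact illustration of that logic on an invented HTML snippet:

from bs4 import BeautifulSoup

html = "<html><body><nav>menu</nav><main><h1>Title</h1><p>First paragraph.</p><li>Item</li></main></body></html>"
soup = BeautifulSoup(html, 'html.parser')
main_content = soup.find('article') or soup.find('main') or soup

content = []
for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
    for element in main_content.find_all(tag):
        text = element.get_text(strip=True)
        if text:
            content.append(text)

print(content)  # only text inside <main>; the <nav> text is skipped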
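
Note on link filtering: get_links() resolves each href against the page it was found on and keeps only URLs that stay under the crawl's base URL. A small demonstration of that rule; the hrefs below are made up for the example:

from urllib.parse import urljoin

base_url = "https://www.gradio.app/docs"
page_url = "https://www.gradio.app/docs/interface"

for href in ["/docs/blocks", "../guides/quickstart", "https://github.com/gradio-app/gradio"]:
    full_url = urljoin(page_url, href)
    in_scope = full_url.startswith(base_url) and full_url != page_url
    print(full_url, "->", "keep" if in_scope else "skip")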
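
Note on the crawl cache: crawl_pages() stores each page's extracted text as str(content) and rebuilds the list with eval() on a cache hit, which executes whatever string is in the database. If the cache format is ever revisited, a JSON round-trip does the same job more safely; the sketch below is a suggested alternative, not part of this commit, and reuses the table created above:

import json
import sqlite3

conn = sqlite3.connect('crawl_cache.db')
c = conn.cursor()
c.execute('''CREATE TABLE IF NOT EXISTS pages
             (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')

def cache_page(url, content, depth):
    # Serialize the list of strings as JSON instead of str()/eval()
    c.execute("INSERT OR REPLACE INTO pages VALUES (?, ?, ?)",
              (url, json.dumps(content), depth))
    conn.commit()

def cached_content(url):
    c.execute("SELECT content FROM pages WHERE url = ?", (url,))
    row = c.fetchone()
    return json.loads(row[0]) if row else None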
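
Note on the Gradio entry point: with fn=run_async the interface calls a plain synchronous function, and run_async drives the coroutine to completion with asyncio.run(), which creates and tears down a fresh event loop per request. A stripped-down sketch of the same bridge; crawl_and_convert is an illustrative stand-in for crawl_pages() plus PDF generation:

import asyncio

async def crawl_and_convert(url, depth):
    await asyncio.sleep(0)  # placeholder for crawling and PDF generation
    return f"{url} crawled to depth {depth}"

def run_sync(url, depth):
    return asyncio.run(crawl_and_convert(url, depth))

print(run_sync("https://example.com", 2))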