bluenevus committed
Commit f63dcb0 · verified · 1 Parent(s): aa735f8

Update app.py

Files changed (1):
  1. app.py +75 -52
app.py CHANGED
@@ -7,87 +7,111 @@ import tempfile
 import re
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
+import asyncio
+import aiohttp
+from aiolimiter import AsyncLimiter
+import sqlite3

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+# Initialize SQLite database
+conn = sqlite3.connect('crawl_cache.db')
+c = conn.cursor()
+c.execute('''CREATE TABLE IF NOT EXISTS pages
+             (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''')
+conn.commit()
+
+# Rate limiter: 10 requests per second
+rate_limiter = AsyncLimiter(10, 1)
+
 def clean_text(text):
     text = ''.join(char for char in text if char.isprintable())
     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
     return text

-def get_page_content(url):
+async def get_page_content(session, url):
     try:
-        logger.info(f"Fetching content from: {url}")
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
-        content = []
-        main_content = soup.find('article') or soup.find('main') or soup
-        if main_content:
-            for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
-                for element in main_content.find_all(tag):
-                    text = clean_text(element.get_text(strip=True))
-                    if text:
-                        content.append(text)
-        logger.info(f"Found {len(content)} content items for {url}")
-        return content
+        async with rate_limiter:
+            async with session.get(url, timeout=30) as response:
+                if response.status == 200:
+                    text = await response.text()
+                    soup = BeautifulSoup(text, 'html.parser')
+                    content = []
+                    main_content = soup.find('article') or soup.find('main') or soup
+                    if main_content:
+                        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+                            for element in main_content.find_all(tag):
+                                text = clean_text(element.get_text(strip=True))
+                                if text:
+                                    content.append(text)
+                    logger.info(f"Found {len(content)} content items for {url}")
+                    return content
+                else:
+                    logger.error(f"Error fetching {url}: HTTP {response.status}")
+                    return [f"Error fetching {url}: HTTP {response.status}"]
     except Exception as e:
         logger.error(f"Error processing {url}: {str(e)}")
         return [f"Error processing {url}: {str(e)}"]

-def get_links(url, base_url):
+async def get_links(session, url, base_url):
     try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
-        links = soup.find_all('a', href=True)
-        valid_links = []
-        for link in links:
-            full_url = urljoin(url, link['href'])
-            if full_url.startswith(base_url) and full_url != url:
-                valid_links.append(full_url)
-        return valid_links
+        async with rate_limiter:
+            async with session.get(url, timeout=30) as response:
+                if response.status == 200:
+                    text = await response.text()
+                    soup = BeautifulSoup(text, 'html.parser')
+                    links = soup.find_all('a', href=True)
+                    valid_links = []
+                    for link in links:
+                        full_url = urljoin(url, link['href'])
+                        if full_url.startswith(base_url) and full_url != url:
+                            valid_links.append(full_url)
+                    return valid_links
+                else:
+                    logger.error(f"Error fetching links from {url}: HTTP {response.status}")
+                    return []
     except Exception as e:
         logger.error(f"Error getting links from {url}: {str(e)}")
         return []

-def crawl_pages(base_url, max_depth):
+async def crawl_pages(base_url, max_depth):
     visited = set()
     to_visit = [(base_url, 0)]
     all_pages = []

-    def process_page(url, depth):
-        content = get_page_content(url)
-        logger.info(f"Processed page: {url} at depth {depth}")
-        return url, content, depth
-
-    with ThreadPoolExecutor(max_workers=10) as executor:
-        futures = []
+    async with aiohttp.ClientSession() as session:
         while to_visit:
             current_url, depth = to_visit.pop(0)
             if current_url in visited or depth > max_depth:
                 continue

             visited.add(current_url)
-            futures.append(executor.submit(process_page, current_url, depth))
+
+            # Check if page is already in the database
+            c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+            result = c.fetchone()
+            if result:
+                content = eval(result[0])  # Convert string back to list
+            else:
+                content = await get_page_content(session, current_url)
+                # Store in database
+                c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                conn.commit()
+
+            all_pages.append((current_url, content))
+            logger.info(f"Processed page: {current_url} at depth {depth}")

             if depth < max_depth:
-                links = get_links(current_url, base_url)
+                links = await get_links(session, current_url, base_url)
                 for link in links:
                     if link not in visited:
                         to_visit.append((link, depth + 1))

-        for future in as_completed(futures):
-            url, content, depth = future.result()
-            all_pages.append((url, content))
-
     return all_pages

-def website_to_pdf(url, max_depth):
-    logger.info(f"Starting to process: {url} with max depth: {max_depth}")
-    all_pages = crawl_pages(url, max_depth)
-    logger.info(f"Found {len(all_pages)} pages to process")
+def website_to_pdf(all_pages):
+    logger.info(f"Starting PDF generation for {len(all_pages)} pages")

     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
@@ -111,24 +135,23 @@ def website_to_pdf(url, max_depth):

     return pdf_path

-def process_url(url, depth):
+async def process_url(url, depth):
     try:
-        pdf_file = website_to_pdf(url, depth)
+        all_pages = await crawl_pages(url, depth)
+        pdf_file = website_to_pdf(all_pages)
         return pdf_file
     except Exception as e:
         logger.error(f"Error in process_url: {str(e)}")
         return f"An error occurred: {str(e)}"

-def threaded_process_url(url, depth):
-    with ThreadPoolExecutor() as executor:
-        future = executor.submit(process_url, url, depth)
-        return future.result()
+def run_async(url, depth):
+    return asyncio.run(process_url(url, depth))

 iface = gr.Interface(
-    fn=threaded_process_url,
+    fn=run_async,
     inputs=[
         gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
-        gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Crawl Depth")
+        gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Crawl Depth")
     ],
     outputs=gr.File(label="Download PDF"),
     title="Website to PDF Converter",