bluenevus committed on
Commit cb8ca6c · verified · 1 Parent(s): d6fec81

Update app.py

Files changed (1)
  1. app.py  +36 -22
app.py CHANGED
@@ -22,7 +22,7 @@ def get_page_content(url):
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        content = []
-        main_content = soup.find('main')
+        main_content = soup.find('article') or soup.find('main') or soup
        if main_content:
            for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
                for element in main_content.find_all(tag):
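The new selector leans on BeautifulSoup's find() returning None for a missing tag, so the `or` chain falls back from <article> to <main> and finally to the whole parsed document. A small standalone sketch of that behaviour (the HTML strings are invented for illustration, not taken from the Gradio site):

from bs4 import BeautifulSoup

# Page with a <main> landmark but no <article>: the chain picks <main>.
soup = BeautifulSoup("<body><main><p>Docs text</p></main></body>", "html.parser")
main_content = soup.find('article') or soup.find('main') or soup
print(main_content.name)  # -> "main"

# Page with neither landmark: the whole soup is used as a last resort,
# so find_all() still sees every <p>/<h*>/<li> on the page.
bare = BeautifulSoup("<div><p>No landmarks here</p></div>", "html.parser")
fallback = bare.find('article') or bare.find('main') or bare
print(fallback.find('p').text)  # -> "No landmarks here"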
@@ -35,37 +35,48 @@ def get_page_content(url):
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]

-def get_all_doc_links(url):
+def get_links(url):
    try:
-        logger.info(f"Fetching links from: {url}")
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
-        main_content = soup.find('main')
-        if main_content:
-            links = main_content.find_all('a', href=True)
-            doc_links = [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
-            logger.info(f"Found {len(doc_links)} documentation links")
-            return doc_links
-        logger.warning("No main content found on the page")
-        return []
+        links = soup.find_all('a', href=True)
+        return [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []

-def website_to_pdf(url):
-    base_url = "https://www.gradio.app/docs"
-    logger.info(f"Starting to process: {base_url}")
-    all_links = get_all_doc_links(base_url)
-    logger.info(f"Found {len(all_links)} pages to process")
+def crawl_pages(base_url, max_depth):
+    visited = set()
+    to_visit = [(base_url, 0)]
+    all_pages = []
+
+    while to_visit:
+        current_url, depth = to_visit.pop(0)
+        if current_url in visited or depth > max_depth:
+            continue
+
+        visited.add(current_url)
+        content = get_page_content(current_url)
+        all_pages.append((current_url, content))
+
+        if depth < max_depth:
+            links = get_links(current_url)
+            to_visit.extend((link, depth + 1) for link in links if link not in visited)
+
+    return all_pages
+
+def website_to_pdf(url, max_depth):
+    logger.info(f"Starting to process: {url} with max depth: {max_depth}")
+    all_pages = crawl_pages(url, max_depth)
+    logger.info(f"Found {len(all_pages)} pages to process")

    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

-    for page_url in all_links:
-        content = get_page_content(page_url)
+    for page_url, content in all_pages:
        pdf.cell(0, 10, txt=page_url, ln=True)
        pdf.ln(5)
        for text in content:
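The added crawl_pages is a breadth-first traversal: a FIFO list of (url, depth) pairs, a visited set to avoid re-fetching, and a depth cap that stops expansion. A standalone sketch of the same pattern with the network-bound helpers stubbed out, so the traversal order is visible without hitting any site (the link graph and stub functions below are invented for illustration):

# Fake link graph standing in for the live Gradio docs pages.
FAKE_LINKS = {
    "/docs": ["/docs/interface", "/docs/blocks"],
    "/docs/interface": ["/docs/interface/launch"],
    "/docs/blocks": [],
    "/docs/interface/launch": [],
}

def get_links(url):          # stub for the requests/BeautifulSoup link extractor
    return FAKE_LINKS.get(url, [])

def get_page_content(url):   # stub for the real page scraper
    return [f"content of {url}"]

def crawl_pages(base_url, max_depth):
    visited = set()
    to_visit = [(base_url, 0)]          # FIFO queue of (url, depth) pairs
    all_pages = []
    while to_visit:
        current_url, depth = to_visit.pop(0)
        if current_url in visited or depth > max_depth:
            continue
        visited.add(current_url)
        all_pages.append((current_url, get_page_content(current_url)))
        if depth < max_depth:
            to_visit.extend((link, depth + 1)
                            for link in get_links(current_url) if link not in visited)
    return all_pages

print([url for url, _ in crawl_pages("/docs", max_depth=1)])
# -> ['/docs', '/docs/interface', '/docs/blocks']  (depth-2 pages are cut off)

One design note: list.pop(0) is O(n) per dequeue; collections.deque.popleft() would make the queue operation O(1), though at documentation-site scale the difference is negligible.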
@@ -82,9 +93,9 @@ def website_to_pdf(url):

    return pdf_path

-def process_url(url):
+def process_url(url, depth):
    try:
-        pdf_file = website_to_pdf(url)
+        pdf_file = website_to_pdf(url, depth)
        return pdf_file
    except Exception as e:
        logger.error(f"Error in process_url: {str(e)}")
@@ -92,10 +103,13 @@ def process_url(url):

iface = gr.Interface(
    fn=process_url,
-    inputs=gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
+    inputs=[
+        gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
+        gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Crawl Depth")
+    ],
    outputs=gr.File(label="Download PDF"),
    title="Gradio Documentation to PDF Converter",
-    description="Enter the Gradio docs URL to convert all documentation pages into a PDF."
+    description="Enter the Gradio docs URL and crawl depth to convert documentation pages into a PDF."
)

if __name__ == "__main__":
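With a list of two input components, gr.Interface passes the Textbox and Slider values positionally to fn, so they arrive as process_url's url and depth arguments. A minimal wiring sketch with a placeholder function body and a Textbox output so it runs without generating a PDF (the real app returns a file path through gr.File):

import gradio as gr

def process_url(url, depth):
    # Placeholder body; the real app crawls the site and returns a PDF path here.
    return f"Would crawl {url} down to depth {int(depth)}"

iface = gr.Interface(
    fn=process_url,
    inputs=[
        gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
        gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Crawl Depth"),
    ],
    outputs=gr.Textbox(label="Result"),  # the real app uses gr.File instead
    title="Gradio Documentation to PDF Converter",
)

if __name__ == "__main__":
    iface.launch()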
 