bluenevus committed
Commit 04286a4 · verified · 1 Parent(s): 13c4089

Update app.py

Files changed (1)
  1. app.py +23 -41
app.py CHANGED
@@ -22,64 +22,46 @@ def get_page_content(url):
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         content = []
-        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
-            for element in soup.find_all(tag):
-                text = clean_text(element.get_text(strip=True))
-                if text:
-                    content.append(text)
+        main_content = soup.find('main')
+        if main_content:
+            for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+                for element in main_content.find_all(tag):
+                    text = clean_text(element.get_text(strip=True))
+                    if text:
+                        content.append(text)
         logger.info(f"Found {len(content)} content items for {url}")
         return content
     except Exception as e:
         logger.error(f"Error processing {url}: {str(e)}")
         return [f"Error processing {url}: {str(e)}"]

-def get_subdirectory_pages(url, base_url, visited=set(), max_pages=100):
-    if url in visited or len(visited) >= max_pages:
-        return []
-
-    visited.add(url)
-    pages = [(url, get_page_content(url))]
-
+def get_all_doc_links(url):
     try:
         response = requests.get(url, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
-
-        for link in soup.find_all('a'):
-            href = link.get('href')
-            if href:
-                full_url = urljoin(base_url, href)
-                parsed_full_url = urlparse(full_url)
-                parsed_base_url = urlparse(base_url)
-
-                # Check if the URL is in the same directory or a direct subdirectory
-                if (parsed_full_url.scheme == parsed_base_url.scheme and
-                    parsed_full_url.netloc == parsed_base_url.netloc and
-                    parsed_full_url.path.startswith(parsed_base_url.path) and
-                    parsed_full_url.path.count('/') <= parsed_base_url.path.count('/') + 1):
-
-                    if full_url not in visited:
-                        pages.extend(get_subdirectory_pages(full_url, base_url, visited, max_pages))
-                        if len(visited) >= max_pages:
-                            break
+        main_content = soup.find('main')
+        if main_content:
+            links = main_content.find_all('a', href=True)
+            return [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
+        return []
     except Exception as e:
-        logger.error(f"Error processing {url}: {e}")
-
-    return pages
+        logger.error(f"Error getting links from {url}: {str(e)}")
+        return []

 def website_to_pdf(url):
-    parsed_url = urlparse(url)
-    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
+    base_url = "https://www.gradio.app/docs"
     logger.info(f"Starting to process: {base_url}")
-    all_pages = get_subdirectory_pages(base_url, base_url)
-    logger.info(f"Found {len(all_pages)} pages to process")
+    all_links = get_all_doc_links(base_url)
+    logger.info(f"Found {len(all_links)} pages to process")

     pdf = FPDF()
     pdf.set_auto_page_break(auto=True, margin=15)
     pdf.add_page()
     pdf.set_font("Arial", size=12)

-    for page_url, content in all_pages:
+    for page_url in all_links:
+        content = get_page_content(page_url)
         pdf.cell(0, 10, txt=page_url, ln=True)
         pdf.ln(5)
         for text in content:
@@ -103,10 +85,10 @@ def process_url(url):

 iface = gr.Interface(
     fn=process_url,
-    inputs=gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs/gradio)"),
+    inputs=gr.Textbox(label="Enter website URL (e.g., https://www.gradio.app/docs)"),
     outputs=gr.File(label="Download PDF"),
-    title="Website Subdirectory to PDF Converter",
-    description="Enter a website URL to convert its subdirectories into a PDF."
+    title="Gradio Documentation to PDF Converter",
+    description="Enter the Gradio docs URL to convert all documentation pages into a PDF."
 )

 if __name__ == "__main__":
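
For a quick sanity check of the new crawling approach before generating a full PDF, a sketch along these lines could be run on its own. It is not part of this commit: collect_doc_links is a hypothetical stand-in that mirrors the updated get_all_doc_links, and it assumes requests and beautifulsoup4 are installed and that the Gradio docs landing page exposes its navigation links inside a <main> element, which is what the new code relies on.

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE_URL = "https://www.gradio.app/docs"  # same hard-coded base as the updated website_to_pdf

def collect_doc_links(url=BASE_URL):
    # Mirror of the updated get_all_doc_links: keep only /docs links found inside <main>.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    main_content = soup.find("main")
    if main_content is None:
        return []
    return [urljoin(url, a["href"])
            for a in main_content.find_all("a", href=True)
            if a["href"].startswith("/docs")]

if __name__ == "__main__":
    links = collect_doc_links()
    print(f"Found {len(links)} documentation pages")
    for link in links[:5]:
        print(link)

Unlike the previous recursive crawler, the new flow only visits pages linked from that single landing page, so any documentation page not linked from its <main> element would be skipped.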