bluenevus committed
Commit 1748e66 · verified · 1 Parent(s): fb64829

Update app.py

Files changed (1)
  1. app.py +20 -9
app.py CHANGED
@@ -5,17 +5,21 @@ from urllib.parse import urljoin, urlparse
 from fpdf import FPDF
 import tempfile
 import re
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 
 def clean_text(text):
-    # Remove any non-printable characters
     text = ''.join(char for char in text if char.isprintable())
-    # Replace any remaining problematic characters
     text = re.sub(r'[^\x00-\x7F]+', ' ', text)
     return text
 
 def get_page_content(url):
     try:
-        response = requests.get(url)
+        logger.info(f"Fetching content from: {url}")
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         content = []
         for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
@@ -23,8 +27,10 @@ def get_page_content(url):
                 text = clean_text(element.get_text(strip=True))
                 if text:
                     content.append(text)
+        logger.info(f"Found {len(content)} content items for {url}")
         return content
     except Exception as e:
+        logger.error(f"Error processing {url}: {str(e)}")
         return [f"Error processing {url}: {str(e)}"]
 
 def get_subdirectory_pages(url, base_url, visited=set(), max_pages=100):
@@ -35,7 +41,8 @@ def get_subdirectory_pages(url, base_url, visited=set(), max_pages=100):
     pages = [(url, get_page_content(url))]
 
     try:
-        response = requests.get(url)
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
 
         for link in soup.find_all('a'):
@@ -47,14 +54,16 @@ def get_subdirectory_pages(url, base_url, visited=set(), max_pages=100):
                     if len(visited) >= max_pages:
                         break
     except Exception as e:
-        print(f"Error processing {url}: {e}")
+        logger.error(f"Error processing {url}: {e}")
 
     return pages
 
 def website_to_pdf(url):
     parsed_url = urlparse(url)
     base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path.rstrip('/')}/"
+    logger.info(f"Starting to process: {base_url}")
     all_pages = get_subdirectory_pages(base_url, base_url)
+    logger.info(f"Found {len(all_pages)} pages to process")
 
     pdf = FPDF()
     pdf.add_page()
@@ -63,20 +72,22 @@ def website_to_pdf(url):
     for page_url, content in all_pages:
         pdf.cell(200, 10, txt=page_url, ln=True)
         for text in content:
-            pdf.multi_cell(0, 10, txt=text)
+            pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
         pdf.add_page()
 
     with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
-        pdf.output(tmp.name)
-        output_file = tmp.name
+        pdf_path = tmp.name
+        pdf.output(pdf_path)
+        logger.info(f"PDF saved to: {pdf_path}")
 
-    return output_file
+    return pdf_path
 
 def process_url(url):
     try:
         pdf_file = website_to_pdf(url)
         return pdf_file
     except Exception as e:
+        logger.error(f"Error in process_url: {str(e)}")
         return f"An error occurred: {str(e)}"
 
 iface = gr.Interface(
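
The main behavioral change in this commit is the fetch pattern in get_page_content and get_subdirectory_pages: requests.get now carries a 10-second timeout and is followed by raise_for_status(), so hung connections and 4xx/5xx responses raise instead of being parsed as HTML. The standalone sketch below (not part of the commit) mirrors that pattern outside the app; the URL is only an illustrative placeholder.

    # Standalone sketch of the fetch pattern introduced above (illustrative URL).
    # A connection or read that exceeds the 10-second timeout raises
    # requests.exceptions.Timeout, and raise_for_status() turns 4xx/5xx responses
    # into requests.exceptions.HTTPError, so both now land in the app's except
    # branches rather than producing an empty or misleading page.
    import requests

    url = "https://example.com/docs/"  # placeholder URL
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        print(f"Fetched {len(response.text)} characters from {url}")
    except Exception as e:
        print(f"Error processing {url}: {e}")  # same shape as the app's error entries

When app.py is run directly, the new logging.basicConfig(level=logging.INFO) setup prints these events with Python's default record format, i.e. records shaped like ERROR:__main__:Error processing <url>: ..., which is where the added logger.info and logger.error calls become visible.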