bluenevus committed on
Commit d0bb3ed · verified · 1 Parent(s): cb8ca6c

Update app.py

Files changed (1):
  app.py +12 -4
app.py CHANGED
@@ -35,13 +35,18 @@ def get_page_content(url):
         logger.error(f"Error processing {url}: {str(e)}")
         return [f"Error processing {url}: {str(e)}"]
 
-def get_links(url):
+def get_links(url, base_url):
     try:
         response = requests.get(url, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.text, 'html.parser')
         links = soup.find_all('a', href=True)
-        return [urljoin(url, link['href']) for link in links if link['href'].startswith('/docs')]
+        valid_links = []
+        for link in links:
+            full_url = urljoin(url, link['href'])
+            if full_url.startswith(base_url) and full_url != url:
+                valid_links.append(full_url)
+        return valid_links
     except Exception as e:
         logger.error(f"Error getting links from {url}: {str(e)}")
         return []
@@ -59,10 +64,13 @@ def crawl_pages(base_url, max_depth):
         visited.add(current_url)
         content = get_page_content(current_url)
         all_pages.append((current_url, content))
+        logger.info(f"Processed page: {current_url} at depth {depth}")
 
         if depth < max_depth:
-            links = get_links(current_url)
-            to_visit.extend((link, depth + 1) for link in links if link not in visited)
+            links = get_links(current_url, base_url)
+            for link in links:
+                if link not in visited:
+                    to_visit.append((link, depth + 1))
 
     return all_pages
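For context, the sketch below shows how the two updated functions fit together. Only the diffed lines above come from app.py; the imports, the logging setup, the body of get_page_content, and the breadth-first scaffolding in crawl_pages (the to_visit worklist and visited set) are assumptions filled in here to make the example self-contained and runnable, not code taken from the repository.

# Minimal runnable sketch of app.py after this commit.
# Scaffolding outside the diffed lines is assumed, as noted above.

import logging

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def get_page_content(url):
    # Assumed stand-in: the diff only shows this function's error path,
    # so the happy path here (extracting the page text) is illustrative.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return [soup.get_text(separator=' ', strip=True)]
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]


def get_links(url, base_url):
    # As in the commit: resolve each href against the current page, then
    # keep only links under base_url instead of hard-coding '/docs'.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a', href=True)
        valid_links = []
        for link in links:
            full_url = urljoin(url, link['href'])
            if full_url.startswith(base_url) and full_url != url:
                valid_links.append(full_url)
        return valid_links
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []


def crawl_pages(base_url, max_depth):
    # Assumed BFS scaffolding around the lines visible in the diff:
    # a (url, depth) worklist plus a visited set to avoid revisits.
    visited = set()
    to_visit = [(base_url, 0)]
    all_pages = []
    while to_visit:
        current_url, depth = to_visit.pop(0)
        if current_url in visited:
            continue
        visited.add(current_url)
        content = get_page_content(current_url)
        all_pages.append((current_url, content))
        logger.info(f"Processed page: {current_url} at depth {depth}")

        if depth < max_depth:
            links = get_links(current_url, base_url)
            for link in links:
                if link not in visited:
                    to_visit.append((link, depth + 1))

    return all_pages


if __name__ == "__main__":
    # Illustrative run; the URL is a placeholder, not from the repository.
    for page_url, _ in crawl_pages("https://example.com/docs", max_depth=1):
        print(page_url)

The behavioral change this commit makes: get_links now filters fully resolved URLs against base_url rather than raw hrefs against a hard-coded '/docs' prefix, so the crawl stays scoped to the site being indexed, and crawl_pages logs each processed page and checks visited before queueing a link, reducing duplicate work in the queue.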