bluenevus committed
Commit 156e042 · verified · 1 Parent(s): a8a1bcb

Update app.py

Files changed (1)
  1. app.py +22 -25
app.py CHANGED
@@ -55,13 +55,24 @@ async def get_page_content(session, url):
                 text = await response.text()
                 soup = BeautifulSoup(text, 'html.parser')
                 content = []
-                main_content = soup.find('main') or soup.find('div', class_='content') or soup
+
+                # Look for the main content area
+                main_content = soup.find('div', class_='toc')
+
                 if main_content:
-                    for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'pre', 'code']:
-                        for element in main_content.find_all(tag):
-                            text = clean_text(element.get_text(strip=True))
-                            if text:
-                                content.append(f"{tag.upper()}: {text}")
+                    # Extract section titles and links
+                    for section in main_content.find_all('div', class_='toc--section'):
+                        title = section.find('h2', class_='toc-title-border')
+                        if title:
+                            content.append(f"H2: {title.text.strip()}")
+
+                        links_store = section.find('div', id=lambda x: x and x.startswith("{'type': 'links-store'"))
+                        if links_store:
+                            links = links_store.find_next('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
+                            if links:
+                                for link in links.find_all('a'):
+                                    content.append(f"LINK: {link.text.strip()} - {urljoin(url, link['href'])}")
+
                     logger.info(f"Found {len(content)} content items for {url}")
                     return content
                 else:
@@ -70,7 +81,7 @@ async def get_page_content(session, url):
     except Exception as e:
         logger.error(f"Error processing {url}: {str(e)}")
         return [f"Error processing {url}: {str(e)}"]
-
+
 async def get_links(session, url, base_url):
     try:
         async with rate_limiter:
@@ -105,28 +116,14 @@ async def crawl_pages(base_url, max_depth):
                 continue
 
             visited.add(current_url)
-            start_time = time.time()
-
-            with get_db_connection() as conn:
-                c = conn.cursor()
-                c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
-                result = c.fetchone()
-
-            if result:
-                content = eval(result[0])  # Convert string back to list
-            else:
-                content = await get_page_content(session, current_url)
-                with get_db_connection() as conn:
-                    c = conn.cursor()
-                    c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
-                    conn.commit()
+            logger.info(f"Crawling: {current_url} at depth {depth}")
 
+            content = await get_page_content(session, current_url)
             all_pages.append((current_url, content))
-            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
 
             if depth < max_depth:
-                links = await get_links(session, current_url, base_url)
-                for link in links:
+                new_links = [link.split(' - ')[1] for link in content if link.startswith('LINK:')]
+                for link in new_links:
                     if link not in visited and link not in [url for url, _ in to_visit]:
                         to_visit.append((link, depth + 1))
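
For reference, here is a minimal synchronous sketch of the TOC-parsing logic this commit introduces, runnable on its own with BeautifulSoup4 installed. The sample URL and HTML snippet are invented for illustration; only the selectors (div.toc, div.toc--section, h2.toc-title-border, and the links-store / links-show id prefixes) mirror the new get_page_content() code.

# Minimal, synchronous sketch of the new TOC parsing (no aiohttp/rate limiter).
# SAMPLE_URL and SAMPLE_HTML are made-up examples, not from the repo.
from urllib.parse import urljoin
from bs4 import BeautifulSoup

SAMPLE_URL = "https://example.com/docs/"  # hypothetical page
SAMPLE_HTML = """
<div class="toc">
  <div class="toc--section">
    <h2 class="toc-title-border">Getting Started</h2>
    <div id="{'type': 'links-store', 'index': 0}"></div>
    <div id="{'type': 'links-show', 'index': 0}">
      <a href="/docs/install">Install</a>
      <a href="/docs/quickstart">Quickstart</a>
    </div>
  </div>
</div>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
content = []
main_content = soup.find("div", class_="toc")
if main_content:
    for section in main_content.find_all("div", class_="toc--section"):
        title = section.find("h2", class_="toc-title-border")
        if title:
            content.append(f"H2: {title.text.strip()}")
        links_store = section.find("div", id=lambda x: x and x.startswith("{'type': 'links-store'"))
        if links_store:
            links = links_store.find_next("div", id=lambda x: x and x.startswith("{'type': 'links-show'"))
            if links:
                for link in links.find_all("a"):
                    content.append(f"LINK: {link.text.strip()} - {urljoin(SAMPLE_URL, link['href'])}")

print(content)
# ['H2: Getting Started',
#  'LINK: Install - https://example.com/docs/install',
#  'LINK: Quickstart - https://example.com/docs/quickstart']

Note that crawl_pages() now re-queues URLs by splitting each "LINK:" entry on " - ", so the format string in get_page_content() and the split in crawl_pages() have to stay in sync.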