bluenevus committed on
Commit 35d836c · verified · 1 Parent(s): ead5062

Update app.py

Files changed (1)
  1. app.py +26 -23
app.py CHANGED
@@ -55,17 +55,13 @@ async def get_page_content(session, url):
             text = await response.text()
             soup = BeautifulSoup(text, 'html.parser')
             content = []
-
-            # Look for the main content area
-            main_content = soup.find('div', id='react-entry-point')
-
+            main_content = soup.find('article') or soup.find('main') or soup
             if main_content:
-                # Extract all text content
-                for tag in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'code']):
-                    text = clean_text(tag.get_text(strip=True))
-                    if text:
-                        content.append(f"{tag.name.upper()}: {text}")
-
+                for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+                    for element in main_content.find_all(tag):
+                        text = clean_text(element.get_text(strip=True))
+                        if text:
+                            content.append(text)
             logger.info(f"Found {len(content)} content items for {url}")
             return content
         else:
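The rewritten extraction prefers an article element, then main, then falls back to the whole document, and it appends plain text instead of entries prefixed with the tag name. A minimal standalone sketch of that selection logic; clean_text here is a stand-in for app.py's helper, which this diff does not show:

from bs4 import BeautifulSoup

def clean_text(value):
    # Stand-in for app.py's clean_text helper: collapse runs of whitespace.
    return " ".join(value.split())

html = "<main><h1>Title</h1><p>Hello   world</p><div>sidebar text</div></main>"
soup = BeautifulSoup(html, "html.parser")

# Prefer <article>, then <main>, then fall back to the whole document.
main_content = soup.find("article") or soup.find("main") or soup

content = []
for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
    for element in main_content.find_all(tag):
        text = clean_text(element.get_text(strip=True))
        if text:
            content.append(text)

print(content)  # ['Hello world', 'Title'], grouped by tag name rather than page order

Because the new loop visits one tag name at a time, results are grouped by tag rather than by position in the page, unlike the removed find_all([...]) call.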
@@ -82,18 +78,12 @@ async def get_links(session, url, base_url):
         if response.status == 200:
             text = await response.text()
             soup = BeautifulSoup(text, 'html.parser')
+            links = soup.find_all('a', href=True)
             valid_links = []
-
-            # Look for the main content area
-            main_content = soup.find('div', id='react-entry-point')
-
-            if main_content:
-                for link in main_content.find_all('a', href=True):
-                    href = link['href']
-                    full_url = urljoin(base_url, href)
-                    if full_url.startswith(base_url) and full_url != url:
-                        valid_links.append(full_url)
-
+            for link in links:
+                full_url = urljoin(url, link['href'])
+                if full_url.startswith(base_url) and full_url != url:
+                    valid_links.append(full_url)
             return valid_links
         else:
             logger.error(f"Error fetching links from {url}: HTTP {response.status}")
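Link collection now gathers every anchor on the page instead of only those inside the removed react-entry-point container, resolves relative hrefs against the page currently being parsed (url) rather than base_url, and keeps only results under base_url. A small standalone sketch of that filtering; the URLs are made-up placeholders:

from urllib.parse import urljoin

base_url = "https://example.com/docs/"       # hypothetical crawl root
url = "https://example.com/docs/page-one"    # page currently being parsed
hrefs = ["intro", "/docs/page-two", "https://other.site/x"]

valid_links = []
for href in hrefs:
    # Relative hrefs resolve against the current page, not the crawl root.
    full_url = urljoin(url, href)
    # Keep only in-scope links and drop a link back to the page itself.
    if full_url.startswith(base_url) and full_url != url:
        valid_links.append(full_url)

print(valid_links)
# ['https://example.com/docs/intro', 'https://example.com/docs/page-two']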
@@ -116,14 +106,27 @@ async def crawl_pages(base_url, max_depth):
             visited.add(current_url)
             start_time = time.time()

-            content = await get_page_content(session, current_url)
+            with get_db_connection() as conn:
+                c = conn.cursor()
+                c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+                result = c.fetchone()
+
+            if result:
+                content = eval(result[0]) # Convert string back to list
+            else:
+                content = await get_page_content(session, current_url)
+                with get_db_connection() as conn:
+                    c = conn.cursor()
+                    c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                    conn.commit()
+
             all_pages.append((current_url, content))
             logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

             if depth < max_depth:
                 links = await get_links(session, current_url, base_url)
                 for link in links:
-                    if link not in visited and link not in [url for url, _ in to_visit]:
+                    if link not in visited:
                         to_visit.append((link, depth + 1))

     return all_pages
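The crawl loop now consults a SQLite cache before fetching a page: get_db_connection() is a helper defined elsewhere in app.py (not part of this diff), cached content is stored with str(content) and rehydrated with eval(). A self-contained sketch of that lookup-then-insert pattern, using a hypothetical stand-in for the connection helper and a pages(url, content, depth) schema inferred from the INSERT statement:

import sqlite3
from contextlib import contextmanager

@contextmanager
def get_db_connection(path="pages.db"):
    # Hypothetical stand-in for app.py's helper; assumed to yield a sqlite3 connection.
    conn = sqlite3.connect(path)
    try:
        yield conn
    finally:
        conn.close()

def init_db():
    # Schema inferred from the INSERT in the diff: url, content, depth.
    with get_db_connection() as conn:
        conn.execute("CREATE TABLE IF NOT EXISTS pages (url TEXT, content TEXT, depth INTEGER)")
        conn.commit()

def cached_content(current_url):
    # Return the cached content list for a URL, or None on a cache miss.
    with get_db_connection() as conn:
        c = conn.cursor()
        c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
        row = c.fetchone()
    return eval(row[0]) if row else None  # str()/eval() round-trip, as in the diff

def store_content(current_url, content, depth):
    # Persist the extracted content so later crawls can skip the HTTP fetch.
    with get_db_connection() as conn:
        conn.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
        conn.commit()

init_db()
store_content("https://example.com/docs/intro", ["Title", "Hello world"], 0)
print(cached_content("https://example.com/docs/intro"))  # ['Title', 'Hello world']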