Update app.py
app.py CHANGED
@@ -55,17 +55,13 @@ async def get_page_content(session, url):
                text = await response.text()
                soup = BeautifulSoup(text, 'html.parser')
                content = []
-
-               # Look for the main content area
-               main_content = soup.find('div', id='react-entry-point')
-
+               main_content = soup.find('article') or soup.find('main') or soup
                if main_content:
-
-
-
-
-
-
+                   for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+                       for element in main_content.find_all(tag):
+                           text = clean_text(element.get_text(strip=True))
+                           if text:
+                               content.append(text)
                logger.info(f"Found {len(content)} content items for {url}")
                return content
            else:
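
The new extraction path calls clean_text(), which is defined elsewhere in app.py and not shown in this diff. A minimal sketch of what such a helper could look like, assuming it only normalizes whitespace (the real implementation may do more):

import re

def clean_text(text):
    # Hypothetical stand-in for the clean_text helper used above:
    # collapse runs of whitespace into single spaces and trim the ends.
    return re.sub(r"\s+", " ", text).strip()
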
@@ -82,18 +78,12 @@ async def get_links(session, url, base_url):
            if response.status == 200:
                text = await response.text()
                soup = BeautifulSoup(text, 'html.parser')
+               links = soup.find_all('a', href=True)
                valid_links = []
-
-
-
-
-               if main_content:
-                   for link in main_content.find_all('a', href=True):
-                       href = link['href']
-                       full_url = urljoin(base_url, href)
-                       if full_url.startswith(base_url) and full_url != url:
-                           valid_links.append(full_url)
-
+               for link in links:
+                   full_url = urljoin(url, link['href'])
+                   if full_url.startswith(base_url) and full_url != url:
+                       valid_links.append(full_url)
                return valid_links
            else:
                logger.error(f"Error fetching links from {url}: HTTP {response.status}")
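
The rewritten get_links resolves each href against the current page URL (urljoin(url, ...)) rather than the site root, then keeps only links that stay under base_url and differ from the page itself. A standalone illustration of that filter, using made-up URLs:

from urllib.parse import urljoin

base_url = "https://example.com"          # assumed values for illustration
url = "https://example.com/docs/intro"

hrefs = ["guide", "/api", "https://other.site/x", "https://example.com/docs/intro"]
valid_links = []
for href in hrefs:
    full_url = urljoin(url, href)         # resolve relative to the current page
    if full_url.startswith(base_url) and full_url != url:
        valid_links.append(full_url)

print(valid_links)
# ['https://example.com/docs/guide', 'https://example.com/api']
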
@@ -116,14 +106,27 @@ async def crawl_pages(base_url, max_depth):
            visited.add(current_url)
            start_time = time.time()

-
+           with get_db_connection() as conn:
+               c = conn.cursor()
+               c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+               result = c.fetchone()
+
+           if result:
+               content = eval(result[0])  # Convert string back to list
+           else:
+               content = await get_page_content(session, current_url)
+               with get_db_connection() as conn:
+                   c = conn.cursor()
+                   c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                   conn.commit()
+
            all_pages.append((current_url, content))
            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

            if depth < max_depth:
                links = await get_links(session, current_url, base_url)
                for link in links:
-                   if link not in visited
+                   if link not in visited:
                        to_visit.append((link, depth + 1))

        return all_pages
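
The caching block relies on a get_db_connection helper and a pages table, neither of which appears in this diff. A minimal sketch of how they might be wired up, assuming plain sqlite3 and the three-column schema implied by the INSERT statement (url, content, depth):

import sqlite3
from contextlib import contextmanager

DB_PATH = "pages.db"  # assumed filename; not specified in the diff

@contextmanager
def get_db_connection():
    # Hypothetical helper matching how the diff uses it:
    # yields a connection and closes it when the with-block exits.
    conn = sqlite3.connect(DB_PATH)
    try:
        yield conn
    finally:
        conn.close()

def init_db():
    # Table shape inferred from "INSERT INTO pages VALUES (?, ?, ?)".
    with get_db_connection() as conn:
        conn.execute(
            "CREATE TABLE IF NOT EXISTS pages "
            "(url TEXT PRIMARY KEY, content TEXT, depth INTEGER)"
        )
        conn.commit()

Storing the content list with str() and reading it back with eval() works for this round-trip, though json.dumps/json.loads would avoid evaluating strings pulled from the database.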