Update app.py
app.py CHANGED
@@ -55,13 +55,24 @@ async def get_page_content(session, url):
            text = await response.text()
            soup = BeautifulSoup(text, 'html.parser')
            content = []
-
+
+            # Look for the main content area
+            main_content = soup.find('div', class_='toc')
+
            if main_content:
-
-
-
-
-
+                # Extract section titles and links
+                for section in main_content.find_all('div', class_='toc--section'):
+                    title = section.find('h2', class_='toc-title-border')
+                    if title:
+                        content.append(f"H2: {title.text.strip()}")
+
+                    links_store = section.find('div', id=lambda x: x and x.startswith("{'type': 'links-store'"))
+                    if links_store:
+                        links = links_store.find_next('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
+                        if links:
+                            for link in links.find_all('a'):
+                                content.append(f"LINK: {link.text.strip()} - {urljoin(url, link['href'])}")
+
                logger.info(f"Found {len(content)} content items for {url}")
                return content
            else:
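The block added here walks a table-of-contents container, recording each section heading as an "H2: ..." string and each link beneath it as a "LINK: <text> - <url>" string. As a quick standalone illustration, the sketch below runs the same BeautifulSoup calls against a made-up HTML snippet; the snippet, the parse_toc() wrapper, and the example URL are assumptions for demonstration and not part of app.py.

# Illustrative sketch only: mirrors the parsing added to get_page_content(),
# applied to a hypothetical HTML snippet (the real site's markup may differ).
from urllib.parse import urljoin
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<div class="toc">
  <div class="toc--section">
    <h2 class="toc-title-border">Getting Started</h2>
    <div id="{'type': 'links-store', 'id': 1}"></div>
    <div id="{'type': 'links-show', 'id': 1}">
      <a href="/docs/install">Installation</a>
      <a href="/docs/quickstart">Quickstart</a>
    </div>
  </div>
</div>
"""

def parse_toc(html, url="https://example.com/docs"):
    soup = BeautifulSoup(html, 'html.parser')
    content = []
    main_content = soup.find('div', class_='toc')
    if not main_content:
        return content
    for section in main_content.find_all('div', class_='toc--section'):
        title = section.find('h2', class_='toc-title-border')
        if title:
            content.append(f"H2: {title.text.strip()}")
        links_store = section.find('div', id=lambda x: x and x.startswith("{'type': 'links-store'"))
        if links_store:
            links = links_store.find_next('div', id=lambda x: x and x.startswith("{'type': 'links-show'"))
            if links:
                for link in links.find_all('a'):
                    content.append(f"LINK: {link.text.strip()} - {urljoin(url, link['href'])}")
    return content

print(parse_toc(SAMPLE_HTML))
# ['H2: Getting Started',
#  'LINK: Installation - https://example.com/docs/install',
#  'LINK: Quickstart - https://example.com/docs/quickstart']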
@@ -70,7 +81,7 @@ async def get_page_content(session, url):
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]
-
+
async def get_links(session, url, base_url):
    try:
        async with rate_limiter:
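get_links() guards its requests with `async with rate_limiter:`. The limiter itself is defined elsewhere in app.py and does not appear in this diff; a minimal stand-in, assuming it is simply a cap on concurrent fetches, would be an asyncio.Semaphore, which supports the same async-with usage:

# Hypothetical stand-in for app.py's rate_limiter (the real definition is not
# shown in this diff). A semaphore caps how many fetches run concurrently and
# is entered with "async with", matching the call site above.
import asyncio

import aiohttp

rate_limiter = asyncio.Semaphore(5)  # assumed limit of 5 concurrent requests

async def fetch(session, url):
    async with rate_limiter:  # at most 5 coroutines pass this point at once
        async with session.get(url) as response:
            return await response.text()

async def main():
    async with aiohttp.ClientSession() as session:
        print(len(await fetch(session, "https://example.com")))

# asyncio.run(main())

If the real limiter also spaces requests out over time rather than only capping concurrency, something like aiolimiter's AsyncLimiter (also used via async with) would be the closer analogue.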
@@ -105,28 +116,14 @@ async def crawl_pages(base_url, max_depth):
                continue

            visited.add(current_url)
-
-
-            with get_db_connection() as conn:
-                c = conn.cursor()
-                c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
-                result = c.fetchone()
-
-            if result:
-                content = eval(result[0]) # Convert string back to list
-            else:
-                content = await get_page_content(session, current_url)
-                with get_db_connection() as conn:
-                    c = conn.cursor()
-                    c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
-                    conn.commit()
+            logger.info(f"Crawling: {current_url} at depth {depth}")

+            content = await get_page_content(session, current_url)
            all_pages.append((current_url, content))
-            logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

            if depth < max_depth:
-
-                for link in
+                new_links = [link.split(' - ')[1] for link in content if link.startswith('LINK:')]
+                for link in new_links:
                    if link not in visited and link not in [url for url, _ in to_visit]:
                        to_visit.append((link, depth + 1))
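This hunk removes the sqlite-backed cache, which stored each page's content list via str() and rebuilt it with eval() on the way out, so every page is now fetched directly through get_page_content(). It also derives the next URLs to enqueue from the "LINK: ..." strings that get_page_content() emits. A small illustration of that extraction, using made-up sample data:

# Illustration only (not part of app.py): how the new crawl_pages() code turns
# "LINK: <text> - <url>" strings back into URLs for the crawl queue.
content = [
    "H2: Getting Started",
    "LINK: Installation - https://example.com/docs/install",
    "LINK: Quickstart - https://example.com/docs/quickstart",
]

new_links = [link.split(' - ')[1] for link in content if link.startswith('LINK:')]
print(new_links)
# ['https://example.com/docs/install', 'https://example.com/docs/quickstart']

Note that split(' - ')[1] assumes the link text itself never contains ' - '; a title such as "FAQ - Billing" would put part of the title at index 1 instead of the URL, which rsplit(' - ', 1)[-1] would avoid.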