Update helpers.py
helpers.py  +56 -24
CHANGED
@@ -21,37 +21,69 @@ if "GOOGLE_API_KEY" not in os.environ:
     os.environ["GOOGLE_API_KEY"] = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
     key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
 
-… (20 deleted lines of the old scraper; their content is not preserved in the extracted diff view)
+import asyncio
+from urllib.parse import urljoin
+from playwright.async_api import async_playwright
+from langchain_community.document_loaders import AsyncHtmlLoader
+from langchain_community.document_transformers import Html2TextTransformer
+from tqdm.asyncio import tqdm
+
+async def _fetch_urls(base_url):
+    """Extract all links from a JavaScript-rendered webpage."""
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=True)
+        page = await browser.new_page()
+        await page.goto(base_url)
+        await page.wait_for_load_state("networkidle")
+
+        urls = set()
+        links = await page.locator("a").all()
+        for link in links:
+            href = await link.get_attribute("href")
+            if href and "#" not in href:
+                full_url = urljoin(base_url, href)
+                if full_url.startswith(base_url):
+                    urls.add(full_url)
+
+        await browser.close()
+    return list(urls)
+
+async def _fetch_web_content(urls):
+    """Fetch HTML content and convert it to text, with a progress bar."""
     docs = []
+    progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
+
     for page_url in urls:
         try:
-            loader = …
-            …
+            loader = AsyncHtmlLoader(page_url)
+            html2text = Html2TextTransformer()
+            html = await loader.aload()
+            doc = html2text.transform_documents(html)
+            docs.extend(doc)
         except Exception as e:
-            print(f"…
+            print(f"Error loading {page_url}: {e}")
 
-            …
+        progress_bar.update(1)  # Update progress bar
+
+    progress_bar.close()
     return docs
 
+def scrape_website(base_urls):
+    """
+    Scrapes a list of base URLs and extracts their content.
+    Includes a progress bar for tracking.
+    """
+    async def _main():
+        all_urls = []
+        for base_url in base_urls:
+            urls = await _fetch_urls(base_url)
+            all_urls.extend(urls)
+
+        docs = await _fetch_web_content(all_urls)
+        return docs
+
+    return asyncio.run(_main())
+
 def log_message(messages, filename="chat_log.txt"):
     """Write the message history to the log file."""
     with open(filename, "a", encoding="utf-8") as f:
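For reference, a minimal usage sketch of the new scrape_website helper. It assumes the file above is importable as helpers.py and that Playwright's Chromium browser has been installed (playwright install chromium); the driver script name and the base URL are placeholders, not part of the commit.

# usage_sketch.py -- hypothetical driver script, not part of the commit
from helpers import scrape_website

if __name__ == "__main__":
    # scrape_website crawls each base URL for same-site links (via Playwright),
    # fetches every discovered page, and returns a list of LangChain Documents.
    docs = scrape_website(["https://example.com/"])
    print(f"Scraped {len(docs)} pages")
    if docs:
        print(docs[0].page_content[:200])  # preview the first page's text

One caveat of this design: asyncio.run raises RuntimeError when called from a context that already has a running event loop (a Jupyter notebook, for example), so the synchronous wrapper is best invoked from a plain script.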