quoc-khanh committed on
Commit 983f039 · verified · 1 Parent(s): 3db1109

Update helpers.py

Files changed (1)
  1. helpers.py +56 -24
helpers.py CHANGED
@@ -21,37 +21,69 @@ if "GOOGLE_API_KEY" not in os.environ:
      os.environ["GOOGLE_API_KEY"] = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"
  key = "AIzaSyDJ4vIKuIBIPNHATLxnoHlagXWbsAz-vRs"

- def get_web_documents(base_url='https://nct.neu.edu.vn/'):
-     """Fetch every URL from the site and extract its text content."""
-
-     # Step 1: Collect the list of URLs
-     response = requests.get(base_url)
-     if response.status_code != 200:
-         print(f"Cannot access {base_url}")
-         return []
-
-     soup = BeautifulSoup(response.text, 'html.parser')
-
-     urls = set()  # Use a set to avoid duplicates
-     for link in soup.find_all('a', href=True):
-         href = link.get('href')
-         if href.startswith(base_url):  # Keep internal URLs only
-             urls.add(href)
-
-     print(f"Found {len(urls)} valid URLs.")
-
-     # Step 2: Download content from the URLs
+ import asyncio
+ from urllib.parse import urljoin
+ from playwright.async_api import async_playwright
+ from langchain_community.document_loaders import AsyncHtmlLoader
+ from langchain_community.document_transformers import Html2TextTransformer
+ from tqdm.asyncio import tqdm
+
+ async def _fetch_urls(base_url):
+     """Extract all links from a JavaScript-rendered webpage."""
+     async with async_playwright() as p:
+         browser = await p.chromium.launch(headless=True)
+         page = await browser.new_page()
+         await page.goto(base_url)
+         await page.wait_for_load_state("networkidle")
+
+         urls = set()
+         links = await page.locator("a").all()
+         for link in links:
+             href = await link.get_attribute("href")
+             if href and "#" not in href:
+                 full_url = urljoin(base_url, href)
+                 if full_url.startswith(base_url):
+                     urls.add(full_url)
+
+         await browser.close()
+         return list(urls)
+
+ async def _fetch_web_content(urls):
+     """Fetch HTML content and convert it to text, with a progress bar."""
      docs = []
+     progress_bar = tqdm(total=len(urls), desc="Scraping Pages", unit="page")
+
      for page_url in urls:
          try:
-             loader = WebBaseLoader(web_paths=[page_url])
-             docs.extend(loader.load())  # Synchronous, no async needed
+             loader = AsyncHtmlLoader(page_url)
+             html2text = Html2TextTransformer()
+             html = await loader.aload()
+             doc = html2text.transform_documents(html)
+             docs.extend(doc)
          except Exception as e:
              print(f"Error loading {page_url}: {e}")

-     print(f"Successfully loaded {len(docs)} pages.")
+         progress_bar.update(1)  # Update progress bar
+
+     progress_bar.close()
      return docs

+ def scrape_website(base_urls):
+     """
+     Scrapes a list of base URLs and extracts their content.
+     Includes a progress bar for tracking.
+     """
+     async def _main():
+         all_urls = []
+         for base_url in base_urls:
+             urls = await _fetch_urls(base_url)
+             all_urls.extend(urls)
+
+         docs = await _fetch_web_content(all_urls)
+         return docs
+
+     return asyncio.run(_main())
+
  def log_message(messages, filename="chat_log.txt"):
      """Write the message history to a log file"""
      with open(filename, "a", encoding="utf-8") as f:
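
A minimal usage sketch of the new scrape_website entry point. The base URL is the default taken from the removed get_web_documents; reading page_content assumes the usual LangChain Document objects returned by Html2TextTransformer.

# Usage sketch: scrape_website calls asyncio.run internally, so it must be
# invoked from plain synchronous code, not from inside a running event loop.
if __name__ == "__main__":
    docs = scrape_website(["https://nct.neu.edu.vn/"])
    print(f"Scraped {len(docs)} documents.")
    if docs:
        # Each item is a LangChain Document; the extracted text is in page_content.
        print(docs[0].page_content[:200])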