import os
import json

from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
# from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Imports from helpers
from helpers import (
    list_docx_files,       # List the .docx files in a folder
    get_splits,            # Split a .docx file into document chunks
    get_json_splits_only,  # Split a JSON (FAQ) file into document chunks
    get_web_documents,     # Fetch and split content from web pages
)
def get_vectorstore():
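    """Build a FAISS vectorstore over all document sources.

    Pipeline: fetch the web pages listed in urls.txt, split every .docx
    syllabus, and parse the FAQ JSON files, then embed every chunk with a
    multilingual MiniLM sentence-transformer and index the vectors in FAISS.
    """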
    ### Process all documents and load them into the database
    folder_path = "syllabus_nct_word_format/"
    docx_files = list_docx_files(folder_path)
    all_splits = []  # Accumulates the chunks from every source

    print("Feeding relevant websites' contents")
    with open("syllabus_nct_word_format/urls.txt", "r") as f:
        base_urls = [line.strip() for line in f]
    website_contents = get_web_documents(base_urls=base_urls)
    all_splits += website_contents
    print("Feeding .docx files")
    for i, file_path in enumerate(tqdm(docx_files, desc="Processing", unit="file")):
        output_json_path = f"output_{i}.json"
        splits = get_splits(file_path, output_json_path)
        all_splits += splits
    print("Feeding .json files")
    # Process the FAQ files
    for FAQ_path in (
        "syllabus_nct_word_format/FAQ.json",
        "syllabus_nct_word_format/FAQ2.json",
    ):
        FAQ_splits = get_json_splits_only(FAQ_path)
        all_splits += FAQ_splits
    # Store everything in a vectorstore; Google GenAI embeddings are an alternative:
    # embedding = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
    print("Get embedding model paraphrase-multilingual-MiniLM-L12-v2")
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )

    print("Set vectorstore FAISS")
    vectorstore = FAISS.from_documents(documents=all_splits, embedding=embedding)
    return vectorstore
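

# A minimal usage sketch, not part of the original module: build the index once
# and persist it so later runs can reload it instead of re-scraping and
# re-embedding everything. The "faiss_index" folder name, the k=4 retriever
# setting, and the sample query are illustrative assumptions, not values taken
# from this repo.
if __name__ == "__main__":
    vectorstore = get_vectorstore()
    vectorstore.save_local("faiss_index")  # writes index.faiss + index.pkl

    # Reloading later requires the same embedding model; FAISS stores metadata
    # with pickle, hence allow_dangerous_deserialization=True on trusted files.
    embedding = HuggingFaceEmbeddings(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )
    vectorstore = FAISS.load_local(
        "faiss_index", embedding, allow_dangerous_deserialization=True
    )

    # Expose the store as a retriever for a RAG chain.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
    print(retriever.invoke("How many credits is the Data Science program?")[:1])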